Add an OPM implementation of ILU0

improve file structure in cuistl
run clang-format
Tobias Meyer Andersen 2024-06-19 13:14:59 +02:00
parent ad595fed5e
commit 7a30aaa46e
22 changed files with 2489 additions and 1043 deletions
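
For context on the new preconditioner: ILU0 computes an incomplete LU factorization that keeps exactly the sparsity pattern of A, and applying it amounts to one sparse forward and one sparse backward substitution. The sketch below is a minimal serial, scalar CSR version of the factorization, assuming sorted column indices and a present diagonal; it only illustrates the algorithm, not the blocked, level-set GPU implementation added in this commit.

#include <vector>

// In-place scalar ILU(0) on a CSR matrix: L (unit lower) and U overwrite vals,
// and only entries already present in the sparsity pattern are ever updated.
void serialIlu0(const std::vector<int>& rowPtr, const std::vector<int>& colIdx, std::vector<double>& vals)
{
    const int n = static_cast<int>(rowPtr.size()) - 1;
    for (int i = 0; i < n; ++i) {
        for (int kk = rowPtr[i]; kk < rowPtr[i + 1]; ++kk) {
            const int k = colIdx[kk];
            if (k >= i) {
                break; // only eliminate against earlier rows (columns assumed sorted)
            }
            // find the diagonal of row k (assumed to exist in the pattern)
            double diag = 0.0;
            for (int dd = rowPtr[k]; dd < rowPtr[k + 1]; ++dd) {
                if (colIdx[dd] == k) {
                    diag = vals[dd];
                    break;
                }
            }
            vals[kk] /= diag; // this entry becomes L(i,k)
            // update the rest of row i, but only where A already has an entry
            for (int jj = kk + 1; jj < rowPtr[i + 1]; ++jj) {
                for (int uu = rowPtr[k]; uu < rowPtr[k + 1]; ++uu) {
                    if (colIdx[uu] == colIdx[jj]) {
                        vals[jj] -= vals[kk] * vals[uu]; // A(i,j) -= L(i,k) * U(k,j)
                        break;
                    }
                }
            }
        }
    }
}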

View File

@ -205,16 +205,21 @@ if (HAVE_CUDA)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/cusparse_matrix_operations.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuSparseHandle.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuBuffer.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuVector.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuView.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/vector_operations.cu)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuSparseMatrix.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuDILU.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg OpmCuILU0.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuJac.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuSeqILU0.cpp)
ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg set_device.cpp)
# HEADERS
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/coloringAndReorderingUtils.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cuda_safe_call.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_matrix_operations.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_safe_call.hpp)
@ -223,7 +228,11 @@ if (HAVE_CUDA)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuBlasHandle.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuSparseHandle.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuBuffer.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuDILU.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg OpmCuILU0.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuJac.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuVector.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuView.hpp)
@ -238,6 +247,8 @@ if (HAVE_CUDA)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/vector_operations.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/has_function.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditioner_should_call_post_pre.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/deviceBlockOperations.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/gpuThreadUtils.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg PreconditionerAdapter.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuSeqILU0.hpp)
ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/fix_zero_diagonal.hpp)

View File

@ -24,6 +24,7 @@
#if USE_HIP
#include <opm/simulators/linalg/hipistl/CuBlockPreconditioner.hpp>
#include <opm/simulators/linalg/hipistl/CuDILU.hpp>
#include <opm/simulators/linalg/hipistl/OpmCuILU0.hpp>
#include <opm/simulators/linalg/hipistl/CuJac.hpp>
#include <opm/simulators/linalg/hipistl/CuSeqILU0.hpp>
#include <opm/simulators/linalg/hipistl/PreconditionerAdapter.hpp>
@ -32,6 +33,7 @@
#else
#include <opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp>
#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
#include <opm/simulators/linalg/cuistl/CuJac.hpp>
#include <opm/simulators/linalg/cuistl/CuSeqILU0.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>

View File

@ -349,7 +349,6 @@ struct StandardPreconditioners {
F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
// const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
using field_type = typename V::field_type;
using CuDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
auto cuDILU = std::make_shared<CuDILU>(op.getmat(), split_matrix, tune_gpu_kernels);
@ -358,6 +357,18 @@ struct StandardPreconditioners {
auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
return wrapped;
});
F::addCreator("OPMCUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
using field_type = typename V::field_type;
using OpmCuILU0 = typename cuistl::OpmCuILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
auto cuilu0 = std::make_shared<OpmCuILU0>(op.getmat(), split_matrix, tune_gpu_kernels);
auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, OpmCuILU0>>(cuilu0);
auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
return wrapped;
});
#endif
}
@ -605,6 +616,15 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
std::make_shared<CUJac>(op.getmat(), w));
});
F::addCreator("OPMCUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
using field_type = typename V::field_type;
using CUILU0 = typename cuistl::OpmCuILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUILU0>>(std::make_shared<CUILU0>(op.getmat(), split_matrix, tune_gpu_kernels));
});
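The sequential creator just registered builds, in effect, the following (an illustrative sketch, not part of this diff; the matrix A and the type aliases are placeholders for whatever the simulator assembles):

using Matrix = Dune::BCRSMatrix<Dune::FieldMatrix<double, 3, 3>>;
using Vector = Dune::BlockVector<Dune::FieldVector<double, 3>>;
using GpuVector = Opm::cuistl::CuVector<double>;
using GpuILU0 = Opm::cuistl::OpmCuILU0<Matrix, GpuVector, GpuVector>;

auto cuilu0 = std::make_shared<GpuILU0>(A, /*splitMatrix=*/true, /*tuneKernels=*/true);
auto prec = std::make_shared<Opm::cuistl::PreconditionerAdapter<Vector, Vector, GpuILU0>>(cuilu0);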
F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
const bool split_matrix = prm.get<bool>("split_matrix", true);
const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);

View File

@ -16,110 +16,23 @@
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <chrono>
#include <config.h>
#include <dune/common/fmatrix.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <fmt/core.h>
#include <limits>
#include <opm/common/ErrorMacros.hpp>
#include <opm/common/TimingMacros.hpp>
#include <opm/simulators/linalg/GraphColoring.hpp>
#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>
#include <opm/simulators/linalg/matrixblock.hh>
#include <vector>
#include <config.h>
#include <chrono>
#include <limits>
#include <tuple>
namespace
{
std::vector<int>
createReorderedToNatural(Opm::SparseTable<size_t>& levelSets)
{
auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
int globCnt = 0;
for (auto row : levelSets) {
for (auto col : row) {
OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
res[globCnt++] = static_cast<int>(col);
}
}
return res;
}
std::vector<int>
createNaturalToReordered(Opm::SparseTable<size_t>& levelSets)
{
auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
int globCnt = 0;
for (auto row : levelSets) {
for (auto col : row) {
OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
res[col] = globCnt++;
}
}
return res;
}
template <class M, class field_type, class GPUM>
void
createReorderedMatrix(const M& naturalMatrix,
std::vector<int>& reorderedToNatural,
std::unique_ptr<GPUM>& reorderedGpuMat)
{
M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) {
auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
// For elements in A
for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
dstRowIt.insert(elem.index());
}
}
reorderedGpuMat.reset(new auto (GPUM::fromMatrix(reorderedMatrix, true)));
}
template <class M, class field_type, class GPUM>
void
extractLowerAndUpperMatrices(const M& naturalMatrix,
std::vector<int>& reorderedToNatural,
std::unique_ptr<GPUM>& lower,
std::unique_ptr<GPUM>& upper)
{
const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N()) / 2;
M reorderedLower(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
M reorderedUpper(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin();
lowerIt != reorderedLower.createend();
++lowerIt, ++upperIt) {
auto srcRow = naturalMatrix.begin() + reorderedToNatural[lowerIt.index()];
for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) {
if (elem.index() < srcRow.index()) { // add index to lower matrix if under the diagonal
lowerIt.insert(elem.index());
} else if (elem.index() > srcRow.index()) { // add element to upper matrix if above the diagonal
upperIt.insert(elem.index());
}
}
}
lower.reset(new auto (GPUM::fromMatrix(reorderedLower, true)));
upper.reset(new auto (GPUM::fromMatrix(reorderedUpper, true)));
return;
}
} // NAMESPACE
namespace Opm::cuistl
{
@ -127,8 +40,8 @@ template <class M, class X, class Y, int l>
CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
: m_cpuMatrix(A)
, m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
, m_reorderedToNatural(createReorderedToNatural(m_levelSets))
, m_naturalToReordered(createNaturalToReordered(m_levelSets))
, m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
, m_naturalToReordered(detail::createNaturalToReordered(m_levelSets))
, m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
, m_gpuNaturalToReorder(m_naturalToReordered)
, m_gpuReorderToNatural(m_reorderedToNatural)
@ -154,19 +67,19 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
m_gpuMatrix.nonzeroes(),
A.nonzeroes()));
if (m_splitMatrix) {
m_gpuMatrixReorderedDiag.reset(new auto(CuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N())));
extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(
m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper);
} else {
createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReordered);
m_gpuMatrixReorderedDiag = std::make_unique<CuVector<field_type>>(blocksize_ * blocksize_ * m_cpuMatrix.N());
std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
= detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
m_reorderedToNatural);
m_gpuMatrixReordered = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
m_cpuMatrix, m_reorderedToNatural);
}
computeDiagAndMoveReorderedData();
// HIP does not currently support automatically picking thread block sizes as well as CUDA does,
// so we only do our own manual tuning when tuning is requested and HIP is used
#ifdef USE_HIP
if (m_tuneThreadBlockSizes){
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
}
#endif
@ -188,7 +101,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::computeLowerSolveLevelSetSplit<field_type, blocksize_>(
detail::DILU::solveLowerLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
@ -200,7 +113,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
v.data(),
m_applyThreadBlockSize);
} else {
detail::computeLowerSolveLevelSet<field_type, blocksize_>(
detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuMatrixReordered->getColumnIndices().data(),
@ -221,7 +134,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
const int numOfRowsInLevel = m_levelSets[level].size();
levelStartIdx -= numOfRowsInLevel;
if (m_splitMatrix) {
detail::computeUpperSolveLevelSetSplit<field_type, blocksize_>(
detail::DILU::solveUpperLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
@ -232,7 +145,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
v.data(),
m_applyThreadBlockSize);
} else {
detail::computeUpperSolveLevelSet<field_type, blocksize_>(
detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuMatrixReordered->getColumnIndices().data(),
@ -304,7 +217,7 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::computeDiluDiagonalSplit<field_type, blocksize_>(
detail::DILU::computeDiluDiagonalSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
@ -319,7 +232,8 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
m_gpuDInv.data(),
m_updateThreadBlockSize);
} else {
detail::computeDiluDiagonal<field_type, blocksize_>(m_gpuMatrixReordered->getNonZeroValues().data(),
detail::DILU::computeDiluDiagonal<field_type, blocksize_>(
m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(),
m_gpuMatrixReordered->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
@ -345,23 +259,24 @@ CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
int bestUpdateBlockSize = -1;
int interval = 64;
//temporary buffers for the apply
// temporary buffers for the apply
CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
tmpD = 1;
for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval){
for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
// sometimes the first kernel launch can be slower, so take the time twice
for (int i = 0; i < 2; ++i){
for (int i = 0; i < 2; ++i) {
auto beforeUpdate = std::chrono::high_resolution_clock::now();
m_updateThreadBlockSize = thrBlockSize;
update();
std::ignore = cudaDeviceSynchronize();
auto afterUpdate = std::chrono::high_resolution_clock::now();
if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
if (durationInMicroSec < bestUpdateTime){
if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
long long durationInMicroSec
= std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
if (durationInMicroSec < bestUpdateTime) {
bestUpdateTime = durationInMicroSec;
bestUpdateBlockSize = thrBlockSize;
}
@ -372,9 +287,10 @@ CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
apply(tmpV, tmpD);
std::ignore = cudaDeviceSynchronize();
auto afterApply = std::chrono::high_resolution_clock::now();
if (cudaSuccess == cudaGetLastError()){ // kernel launch was valid
long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
if (durationInMicroSec < bestApplyTime){
if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
long long durationInMicroSec
= std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
if (durationInMicroSec < bestApplyTime) {
bestApplyTime = durationInMicroSec;
bestApplyBlockSize = thrBlockSize;
}

View File

@ -19,17 +19,11 @@
#ifndef OPM_CUDILU_HPP
#define OPM_CUDILU_HPP
#include <dune/istl/preconditioner.hh>
#include <memory>
#include <opm/grid/utility/SparseTable.hpp>
#include <opm/simulators/linalg/GraphColoring.hpp>
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/detail/CuMatrixDescription.hpp>
#include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
#include <opm/simulators/linalg/cuistl/detail/CuSparseResource.hpp>
#include <optional>
#include <vector>
#include <memory>

View File

@ -16,27 +16,13 @@
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <cusparse.h>
#include <dune/common/fmatrix.hh>
#include <dune/common/fvector.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <dune/istl/bvector.hh>
#include <fmt/core.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/CuJac.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp>
#include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_constants.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_wrapper.hpp>
#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
#include <opm/simulators/linalg/matrixblock.hh>
@ -67,11 +53,7 @@ CuJac<M, X, Y, l>::CuJac(const M& A, field_type w)
A.nonzeroes()));
// Compute the inverted diagonal of A and store it in a vector format in m_diagInvFlattened
detail::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrix.getColumnIndices().data(),
m_gpuMatrix.N(),
m_diagInvFlattened.data());
invertDiagonalAndFlatten();
}
template <class M, class X, class Y, int l>
@ -111,7 +93,15 @@ void
CuJac<M, X, Y, l>::update()
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix);
detail::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(m_gpuMatrix.getNonZeroValues().data(),
invertDiagonalAndFlatten();
}
template <class M, class X, class Y, int l>
void
CuJac<M, X, Y, l>::invertDiagonalAndFlatten()
{
detail::JAC::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(
m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrix.getColumnIndices().data(),
m_gpuMatrix.N(),

View File

@ -103,6 +103,8 @@ private:
CuSparseMatrix<field_type> m_gpuMatrix;
//! \brief the diagonal of cuMatrix inverted, and then flattened to fit in a vector
CuVector<field_type> m_diagInvFlattened;
void invertDiagonalAndFlatten();
};
} // end namespace Opm::cuistl
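
For reference, the flattened inverted diagonal computed by invertDiagonalAndFlatten() is what a block-Jacobi apply consumes: each block row is updated as v_i = w * D_ii^{-1} * d_i. Below is a serial sketch with row-major flattened blocks (illustrative only, not the CUDA kernel).

#include <cstddef>
#include <vector>

// Serial block-Jacobi apply using a flattened, already inverted block diagonal:
// v_i = w * Dinv_i * d_i for every block row i, with Dinv stored row-major per block.
template <int blocksize>
void blockJacobiApply(const std::vector<double>& diagInvFlattened,
                      const std::vector<double>& d,
                      double w,
                      std::vector<double>& v)
{
    const std::size_t numRows = d.size() / blocksize;
    for (std::size_t row = 0; row < numRows; ++row) {
        const double* Dinv = &diagInvFlattened[row * blocksize * blocksize];
        for (int i = 0; i < blocksize; ++i) {
            double sum = 0.0;
            for (int j = 0; j < blocksize; ++j) {
                sum += Dinv[i * blocksize + j] * d[row * blocksize + j];
            }
            v[row * blocksize + i] = w * sum;
        }
    }
}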

View File

@ -0,0 +1,318 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <chrono>
#include <config.h>
#include <dune/common/fmatrix.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <fmt/core.h>
#include <limits>
#include <opm/common/ErrorMacros.hpp>
#include <opm/common/TimingMacros.hpp>
#include <opm/simulators/linalg/GraphColoring.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
#include <opm/simulators/linalg/matrixblock.hh>
#include <tuple>
namespace Opm::cuistl
{
template <class M, class X, class Y, int l>
OpmCuILU0<M, X, Y, l>::OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels)
: m_cpuMatrix(A)
, m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
, m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
, m_naturalToReordered(detail::createNaturalToReordered(m_levelSets))
, m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
, m_gpuMatrixReorderedLower(nullptr)
, m_gpuMatrixReorderedUpper(nullptr)
, m_gpuNaturalToReorder(m_naturalToReordered)
, m_gpuReorderToNatural(m_reorderedToNatural)
, m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
, m_splitMatrix(splitMatrix)
, m_tuneThreadBlockSizes(tuneKernels)
{
// TODO: Should in some way verify that this matrix is symmetric; only do it in debug mode?
// Some sanity check
OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
fmt::format("CuSparse matrix not same size as DUNE matrix. {} vs {}.", m_gpuMatrix.N(), A.N()));
OPM_ERROR_IF(A[0][0].N() != m_gpuMatrix.blockSize(),
fmt::format("CuSparse matrix not same blocksize as DUNE matrix. {} vs {}.",
m_gpuMatrix.blockSize(),
A[0][0].N()));
OPM_ERROR_IF(A.N() * A[0][0].N() != m_gpuMatrix.dim(),
fmt::format("CuSparse matrix not same dimension as DUNE matrix. {} vs {}.",
m_gpuMatrix.dim(),
A.N() * A[0][0].N()));
OPM_ERROR_IF(A.nonzeroes() != m_gpuMatrix.nonzeroes(),
fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. ",
m_gpuMatrix.nonzeroes(),
A.nonzeroes()));
if (m_splitMatrix) {
m_gpuMatrixReorderedDiag.emplace(CuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N()));
std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
= detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
m_reorderedToNatural);
} else {
m_gpuReorderedLU = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
m_cpuMatrix, m_reorderedToNatural);
}
LUFactorizeAndMoveData();
#ifdef USE_HIP
if (m_tuneThreadBlockSizes) {
tuneThreadBlockSizes();
}
#endif
}
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
{
}
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::apply(X& v, const Y& d)
{
OPM_TIMEBLOCK(prec_apply);
{
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::ILU0::solveLowerLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
d.data(),
v.data(),
m_applyThreadBlockSize);
} else {
detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuReorderedLU->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
d.data(),
v.data(),
m_applyThreadBlockSize);
}
levelStartIdx += numOfRowsInLevel;
}
levelStartIdx = m_cpuMatrix.N();
for (int level = m_levelSets.size() - 1; level >= 0; --level) {
const int numOfRowsInLevel = m_levelSets[level].size();
levelStartIdx -= numOfRowsInLevel;
if (m_splitMatrix) {
detail::ILU0::solveUpperLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
m_gpuMatrixReorderedDiag.value().data(),
v.data(),
m_applyThreadBlockSize);
} else {
detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuReorderedLU->getColumnIndices().data(),
m_gpuReorderToNatural.data(),
levelStartIdx,
numOfRowsInLevel,
v.data(),
m_applyThreadBlockSize);
}
}
}
}
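The apply() above issues one kernel launch per level set: all rows within a level depend only on rows from earlier levels (for the lower solve) or later levels (for the upper solve), so they can be processed with one GPU thread per row. A serial scalar sketch of the forward solve Lz = d in the same traversal order (illustrative; the real code works on blocks and on the split, reordered GPU matrices):

#include <cstddef>
#include <vector>

// Serial sketch of the level-set forward solve Lz = d: the lower factor is unit
// triangular, so each row only needs the already computed entries of z.
void lowerSolveByLevelSets(const std::vector<std::vector<int>>& levelSets, // rows grouped by level (only the sizes are used here)
                           const std::vector<int>& reorderedToNatural,
                           const std::vector<int>& rowPtr, // reordered strictly-lower factor, scalar CSR
                           const std::vector<int>& colIdx, // natural column indices
                           const std::vector<double>& vals,
                           const std::vector<double>& d,
                           std::vector<double>& z)
{
    int reorderedRow = 0;
    for (const auto& level : levelSets) {
        // rows inside a level only depend on rows from earlier levels,
        // which is what allows one GPU thread per row within each launch
        for (std::size_t i = 0; i < level.size(); ++i, ++reorderedRow) {
            const int naturalRow = reorderedToNatural[reorderedRow];
            double rhs = d[naturalRow];
            for (int block = rowPtr[reorderedRow]; block < rowPtr[reorderedRow + 1]; ++block) {
                rhs -= vals[block] * z[colIdx[block]];
            }
            z[naturalRow] = rhs; // unit diagonal: no division needed
        }
    }
}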
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::post([[maybe_unused]] X& x)
{
}
template <class M, class X, class Y, int l>
Dune::SolverCategory::Category
OpmCuILU0<M, X, Y, l>::category() const
{
return Dune::SolverCategory::sequential;
}
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::update()
{
OPM_TIMEBLOCK(prec_update);
{
m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
LUFactorizeAndMoveData();
}
}
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::LUFactorizeAndMoveData()
{
OPM_TIMEBLOCK(prec_update);
{
if (m_splitMatrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuMatrix.getColumnIndices().data(),
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
m_gpuNaturalToReorder.data(),
m_gpuMatrixReorderedLower->N(),
m_updateThreadBlockSize);
} else {
detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(),
m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuNaturalToReorder.data(),
m_gpuReorderedLU->N(),
m_updateThreadBlockSize);
}
int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size();
if (m_splitMatrix) {
detail::ILU0::LUFactorizationSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuMatrixReorderedDiag.value().data(),
m_gpuReorderToNatural.data(),
m_gpuNaturalToReorder.data(),
levelStartIdx,
numOfRowsInLevel,
m_updateThreadBlockSize);
} else {
detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
m_gpuReorderedLU->getRowIndices().data(),
m_gpuReorderedLU->getColumnIndices().data(),
m_gpuNaturalToReorder.data(),
m_gpuReorderToNatural.data(),
numOfRowsInLevel,
levelStartIdx,
m_updateThreadBlockSize);
}
levelStartIdx += numOfRowsInLevel;
}
}
}
template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::tuneThreadBlockSizes()
{
// TODO: generalize this tuning process into a function separate from the class
long long bestApplyTime = std::numeric_limits<long long>::max();
long long bestUpdateTime = std::numeric_limits<long long>::max();
int bestApplyBlockSize = -1;
int bestUpdateBlockSize = -1;
int interval = 64;
// temporary buffers for the apply
CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
tmpD = 1;
for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
// sometimes the first kernel launch can be slower, so take the time twice
for (int i = 0; i < 2; ++i) {
auto beforeUpdate = std::chrono::high_resolution_clock::now();
m_updateThreadBlockSize = thrBlockSize;
update();
std::ignore = cudaDeviceSynchronize();
auto afterUpdate = std::chrono::high_resolution_clock::now();
if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
long long durationInMicroSec
= std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
if (durationInMicroSec < bestUpdateTime) {
bestUpdateTime = durationInMicroSec;
bestUpdateBlockSize = thrBlockSize;
}
}
auto beforeApply = std::chrono::high_resolution_clock::now();
m_applyThreadBlockSize = thrBlockSize;
apply(tmpV, tmpD);
std::ignore = cudaDeviceSynchronize();
auto afterApply = std::chrono::high_resolution_clock::now();
if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
long long durationInMicroSec
= std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
if (durationInMicroSec < bestApplyTime) {
bestApplyTime = durationInMicroSec;
bestApplyBlockSize = thrBlockSize;
}
}
}
}
m_applyThreadBlockSize = bestApplyBlockSize;
m_updateThreadBlockSize = bestUpdateBlockSize;
}
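Regarding the TODO above, one way the timing loop could be factored out of the class is a generic helper that times an arbitrary launch callback. This is only a sketch of a possible refactoring, not code from this commit:

#include <chrono>
#include <cuda_runtime.h>
#include <limits>
#include <tuple>

// Returns the block size (multiples of `interval` up to `maxBlockSize`) giving the
// fastest valid launch of `runKernel`; returns -1 if no launch succeeded.
template <class F>
int tuneBlockSize(F&& runKernel, int maxBlockSize = 1024, int interval = 64)
{
    long long bestTime = std::numeric_limits<long long>::max();
    int bestBlockSize = -1;
    for (int blockSize = interval; blockSize <= maxBlockSize; blockSize += interval) {
        for (int i = 0; i < 2; ++i) { // the first launch can be slower, so time twice
            const auto start = std::chrono::high_resolution_clock::now();
            runKernel(blockSize);
            std::ignore = cudaDeviceSynchronize();
            const auto stop = std::chrono::high_resolution_clock::now();
            if (cudaGetLastError() == cudaSuccess) { // kernel launch was valid
                const auto us = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
                if (us < bestTime) {
                    bestTime = us;
                    bestBlockSize = blockSize;
                }
            }
        }
    }
    return bestBlockSize;
}

With such a helper, tuneThreadBlockSizes() would reduce to two calls, one wrapping update() and one wrapping apply().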
} // namespace Opm::cuistl
#define INSTANTIATE_CUDILU_DUNE(realtype, blockdim) \
template class ::Opm::cuistl::OpmCuILU0<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>, \
::Opm::cuistl::CuVector<realtype>, \
::Opm::cuistl::CuVector<realtype>>; \
template class ::Opm::cuistl::OpmCuILU0<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>, \
::Opm::cuistl::CuVector<realtype>, \
::Opm::cuistl::CuVector<realtype>>
INSTANTIATE_CUDILU_DUNE(double, 1);
INSTANTIATE_CUDILU_DUNE(double, 2);
INSTANTIATE_CUDILU_DUNE(double, 3);
INSTANTIATE_CUDILU_DUNE(double, 4);
INSTANTIATE_CUDILU_DUNE(double, 5);
INSTANTIATE_CUDILU_DUNE(double, 6);
INSTANTIATE_CUDILU_DUNE(float, 1);
INSTANTIATE_CUDILU_DUNE(float, 2);
INSTANTIATE_CUDILU_DUNE(float, 3);
INSTANTIATE_CUDILU_DUNE(float, 4);
INSTANTIATE_CUDILU_DUNE(float, 5);
INSTANTIATE_CUDILU_DUNE(float, 6);

View File

@ -0,0 +1,139 @@
/*
Copyright 2022-2023 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_CUILU0_OPM_Impl_HPP
#define OPM_CUILU0_OPM_Impl_HPP
#include <memory>
#include <opm/grid/utility/SparseTable.hpp>
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <optional>
#include <type_traits>
#include <vector>
namespace Opm::cuistl
{
//! \brief ILU0 preconditioner on the GPU.
//!
//! \tparam M The matrix type to operate on
//! \tparam X Type of the update
//! \tparam Y Type of the defect
//! \tparam l Ignored. Just there to have the same number of template arguments
//! as other preconditioners.
//!
//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
//! arguments in case of future additions.
template <class M, class X, class Y, int l = 1>
class OpmCuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
{
public:
//! \brief The matrix type the preconditioner is for.
using matrix_type = typename std::remove_const<M>::type;
//! \brief The domain type of the preconditioner.
using domain_type = X;
//! \brief The range type of the preconditioner.
using range_type = Y;
//! \brief The field type of the preconditioner.
using field_type = typename X::field_type;
//! \brief The GPU matrix type
using CuMat = CuSparseMatrix<field_type>;
//! \brief Constructor.
//!
//! Constructor gets all parameters to operate the preconditioner.
//! \param A The matrix to operate on.
//! \param splitMatrix If true, the lower and upper triangular parts and the diagonal are stored separately.
//! \param tuneKernels If true, the thread block sizes of the GPU kernels are tuned experimentally.
//!
explicit OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels);
//! \brief Prepare the preconditioner.
//! \note Does nothing at the time being.
void pre(X& x, Y& b) override;
//! \brief Apply the preconditioner.
void apply(X& v, const Y& d) override;
//! \brief Post processing
//! \note Does nothing at the moment
void post(X& x) override;
//! Category of the preconditioner (see SolverCategory::Category)
Dune::SolverCategory::Category category() const override;
//! \brief Updates the matrix data.
void update() final;
//! \brief Compute LU factorization, and update the data of the reordered matrix
void LUFactorizeAndMoveData();
//! \brief Function that will experimentally tune the thread block sizes of the important CUDA kernels
void tuneThreadBlockSizes();
//! \returns false
static constexpr bool shouldCallPre()
{
return false;
}
//! \returns false
static constexpr bool shouldCallPost()
{
return false;
}
private:
//! \brief Reference to the underlying matrix
const M& m_cpuMatrix;
//! \brief size_t describing the dimensions of the square block elements
static constexpr const size_t blocksize_ = matrix_type::block_type::cols;
//! \brief SparseTable storing each row by level
Opm::SparseTable<size_t> m_levelSets;
//! \brief converts from an index in the reordered structure to an index in the naturally ordered structure
std::vector<int> m_reorderedToNatural;
//! \brief converts from an index in the naturally ordered structure to an index in the reordered structure
std::vector<int> m_naturalToReordered;
//! \brief The A matrix stored on the GPU, and its reordered version
CuMat m_gpuMatrix;
std::unique_ptr<CuMat> m_gpuReorderedLU;
//! \brief If matrix splitting is enabled, then we store the lower and upper part separately
std::unique_ptr<CuMat> m_gpuMatrixReorderedLower;
std::unique_ptr<CuMat> m_gpuMatrixReorderedUpper;
//! \brief If matrix splitting is enabled, we also store the diagonal separately
std::optional<CuVector<field_type>> m_gpuMatrixReorderedDiag;
//! row conversion from natural to reordered matrix indices stored on the GPU
CuVector<int> m_gpuNaturalToReorder;
//! row conversion from reordered to natural matrix indices stored on the GPU
CuVector<int> m_gpuReorderToNatural;
//! \brief Stores the inverted diagonal that we use in ILU0
CuVector<field_type> m_gpuDInv;
//! \brief Bool storing whether or not we should store matrices in a split format
bool m_splitMatrix;
//! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
bool m_tuneThreadBlockSizes;
//! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
//! The default value of -1 indicates that we have not calibrated and selected a value yet
int m_applyThreadBlockSize = -1;
int m_updateThreadBlockSize = -1;
};
} // end namespace Opm::cuistl
#endif
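
A short usage sketch of the interface declared above, assuming an assembled BCRS matrix A (all names below are placeholders): apply() performs one ILU0 solve on the GPU, and update() re-uploads the matrix values and redoes the factorization after the CPU matrix changes.

using Matrix = Dune::BCRSMatrix<Dune::FieldMatrix<double, 3, 3>>;
using GpuVector = Opm::cuistl::CuVector<double>;

Opm::cuistl::OpmCuILU0<Matrix, GpuVector, GpuVector> prec(A, /*splitMatrix=*/true, /*tuneKernels=*/false);

GpuVector v(A.N() * Matrix::block_type::cols);
GpuVector d(A.N() * Matrix::block_type::cols);
d = 1.0;

prec.apply(v, d); // v = (LU)^{-1} d, computed level set by level set on the GPU
// ... the entries of A change on the CPU (same sparsity pattern) ...
prec.update();    // push the new values to the GPU and redo the LU factorization
prec.apply(v, d);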

View File

@ -0,0 +1,110 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_COLORING_AND_REORDERING_UTILS_HPP
#define OPM_COLORING_AND_REORDERING_UTILS_HPP
#include <fmt/core.h>
#include <memory>
#include <opm/common/ErrorMacros.hpp>
#include <opm/grid/utility/SparseTable.hpp>
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
#include <tuple>
#include <vector>
/*
This file contains a collection of utility functions used in the GPU implementations of ILU and DILU.
The functions deal with creating the mappings between reordered and natural indices, as well as
extracting sparsity structures from Dune matrices and creating CuSparseMatrix indices.
A small worked example of the reordering mappings follows this header.
*/
namespace Opm::cuistl::detail
{
inline std::vector<int>
createReorderedToNatural(const Opm::SparseTable<size_t>& levelSets)
{
auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
int globCnt = 0;
for (auto row : levelSets) {
for (auto col : row) {
OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
res[globCnt++] = static_cast<int>(col);
}
}
return res;
}
inline std::vector<int>
createNaturalToReordered(const Opm::SparseTable<size_t>& levelSets)
{
auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
int globCnt = 0;
for (auto row : levelSets) {
for (auto col : row) {
OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
res[col] = globCnt++;
}
}
return res;
}
template <class M, class field_type, class GPUM>
inline std::unique_ptr<GPUM>
createReorderedMatrix(const M& naturalMatrix, const std::vector<int>& reorderedToNatural)
{
M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) {
auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
// For elements in A
for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
dstRowIt.insert(elem.index());
}
}
return std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedMatrix, true)));
}
template <class M, class field_type, class GPUM>
inline std::tuple<std::unique_ptr<GPUM>, std::unique_ptr<GPUM>>
extractLowerAndUpperMatrices(const M& naturalMatrix, const std::vector<int>& reorderedToNatural)
{
const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N()) / 2;
M reorderedLower(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
M reorderedUpper(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin();
lowerIt != reorderedLower.createend();
++lowerIt, ++upperIt) {
auto srcRow = naturalMatrix.begin() + reorderedToNatural[lowerIt.index()];
for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) {
if (elem.index() < srcRow.index()) { // add index to lower matrix if under the diagonal
lowerIt.insert(elem.index());
} else if (elem.index() > srcRow.index()) { // add element to upper matrix if above the diagonal
upperIt.insert(elem.index());
}
}
}
return {std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedLower, true))),
std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedUpper, true)))};
}
} // namespace Opm::cuistl::detail
#endif
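
A small worked example of the two mappings defined above, assuming the coloring produced the level sets {1, 2} and {0, 3} (illustrative numbers only): walking the levels row by row visits the natural rows in the order 1, 2, 0, 3.

#include <cassert>
#include <vector>

// The two mappings are inverse permutations of each other:
// reorderedToNatural[r] is the natural row processed at reordered position r,
// naturalToReordered[n] is the reordered position of natural row n.
void reorderingExample()
{
    const std::vector<int> reorderedToNatural {1, 2, 0, 3};
    const std::vector<int> naturalToReordered {2, 0, 1, 3};
    for (int reordered = 0; reordered < 4; ++reordered) {
        assert(naturalToReordered[reorderedToNatural[reordered]] == reordered);
    }
}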

View File

@ -16,425 +16,17 @@
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <stdexcept>
#include <config.h>
namespace Opm::cuistl::detail
{
namespace
{
// TODO: figure out if this can be generalized effectively, this seems excessively verbose
// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
{
if (blocksize == 1) {
dstBlock[0] = 1.0 / (srcBlock[0]);
} else if (blocksize == 2) {
T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
dstBlock[0] = srcBlock[3] * detInv;
dstBlock[1] = -srcBlock[1] * detInv;
dstBlock[2] = -srcBlock[2] * detInv;
dstBlock[3] = srcBlock[0] * detInv;
} else if (blocksize == 3) {
// based on Dune implementation
T t4 = srcBlock[0] * srcBlock[4];
T t6 = srcBlock[0] * srcBlock[5];
T t8 = srcBlock[1] * srcBlock[3];
T t10 = srcBlock[2] * srcBlock[3];
T t12 = srcBlock[1] * srcBlock[6];
T t14 = srcBlock[2] * srcBlock[6];
T t17 = 1.0
/ (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
- t14 * srcBlock[4]); // t17 is 1/determinant
dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17;
dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17;
dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17;
dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17;
dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17;
dstBlock[5] = -(t6 - t10) * t17;
dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17;
dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17;
dstBlock[8] = (t4 - t8) * t17;
}
}
// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void invBlockInPlace(T* __restrict__ block)
{
if (blocksize == 1) {
block[0] = 1.0 / (block[0]);
} else if (blocksize == 2) {
T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]);
T temp = block[0];
block[0] = block[3] * detInv;
block[1] = -block[1] * detInv;
block[2] = -block[2] * detInv;
block[3] = temp * detInv;
} else if (blocksize == 3) {
T t4 = block[0] * block[4];
T t6 = block[0] * block[5];
T t8 = block[1] * block[3];
T t10 = block[2] * block[3];
T t12 = block[1] * block[6];
T t14 = block[2] * block[6];
T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
T t17 = T(1.0) / det;
T matrix01 = block[1];
T matrix00 = block[0];
T matrix10 = block[3];
T matrix11 = block[4];
block[0] = (block[4] * block[8] - block[5] * block[7]) * t17;
block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17;
block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17;
block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17;
block[4] = (matrix00 * block[8] - t14) * t17;
block[5] = -(t6 - t10) * t17;
block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17;
block[7] = -(matrix00 * block[7] - t12) * t17;
block[8] = (t4 - t8) * t17;
}
}
enum class MVType { SET, PLUS, MINUS };
// SET: c = A*b
// PLUS: c += A*b
// MINUS: c -= A*b
template <class T, int blocksize, MVType OP>
__device__ __forceinline__ void matrixVectorProductWithAction(const T* A, const T* b, T* c)
{
for (int i = 0; i < blocksize; ++i) {
if (OP == MVType::SET) {
c[i] = 0;
}
for (int j = 0; j < blocksize; ++j) {
if (OP == MVType::SET || OP == MVType::PLUS) {
c[i] += A[i * blocksize + j] * b[j];
} else if (OP == MVType::MINUS) {
c[i] -= A[i * blocksize + j] * b[j];
}
}
}
}
template <class T, int blocksize>
__device__ __forceinline__ void mv(const T* a, const T* b, T* c)
{
matrixVectorProductWithAction<T, blocksize, MVType::SET>(a, b, c);
}
template <class T, int blocksize>
__device__ __forceinline__ void umv(const T* a, const T* b, T* c)
{
matrixVectorProductWithAction<T, blocksize, MVType::PLUS>(a, b, c);
}
template <class T, int blocksize>
__device__ __forceinline__ void mmv(const T* a, const T* b, T* c)
{
matrixVectorProductWithAction<T, blocksize, MVType::MINUS>(a, b, c);
}
// dst -= A*B*C
template <class T, int blocksize>
__device__ __forceinline__ void mmx2Subtraction(T* A, T* B, T* C, T* dst)
{
T tmp[blocksize * blocksize] = {0};
// tmp = A*B
for (int i = 0; i < blocksize; ++i) {
for (int k = 0; k < blocksize; ++k) {
for (int j = 0; j < blocksize; ++j) {
tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
}
}
}
// dst = tmp*C
for (int i = 0; i < blocksize; ++i) {
for (int k = 0; k < blocksize; ++k) {
for (int j = 0; j < blocksize; ++j) {
dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j];
}
}
}
}
template <class T, int blocksize>
__global__ void
cuInvertDiagonalAndFlatten(T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
const auto row = blockDim.x * blockIdx.x + threadIdx.x;
if (row < numberOfRows) {
size_t nnzIdx = rowIndices[row];
size_t nnzIdxLim = rowIndices[row + 1];
// the loop below also checks that we stay within the row limits, which only matters if the diagonal has a
// zero element
while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) {
++nnzIdx;
}
// diagBlock points to the start of where the diagonal block is stored
T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
// vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
// element should be stored
T* vecBlock = &vec[blocksize * blocksize * row];
invBlockOutOfPlace<T, blocksize>(diagBlock, vecBlock);
}
}
template <class T, int blocksize>
__global__ void cuComputeLowerSolveLevelSet(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedRowIdx];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = d[naturalRowIdx * blocksize + i];
}
for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuComputeLowerSolveLevelSetSplit(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedRowIdx];
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = d[naturalRowIdx * blocksize + i];
}
// TODO: remove the first condition in the for loop
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuComputeUpperSolveLevelSet(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize] = {0};
for (int block = nnzIdxLim - 1; colIndices[block] > naturalRowIdx; --block) {
const int col = colIndices[block];
umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuComputeUpperSolveLevelSetSplit(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedRowIdx];
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize] = {0};
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuComputeDiluDiagonal(T* mat,
int* rowIndices,
int* colIndices,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv)
{
const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
const size_t nnzIdx = rowIndices[reorderedRowIdx];
int diagIdx = nnzIdx;
while (colIndices[diagIdx] != naturalRowIdx) {
++diagIdx;
}
T dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInvTmp[i * blocksize + j] = mat[diagIdx * blocksize * blocksize + i * blocksize + j];
}
}
for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
const int col = naturalToReordered[colIndices[block]];
// find element with indices swapped
// Binary search over block in the right row, [rowIndices[col], rowindices[col+1]-1] defines the range
// we binary search over
int left = rowIndices[col];
int right = rowIndices[col + 1] - 1;
int mid = left + (right - left) / 2; // overflow-safe average;
while (left <= right) {
const int col = colIndices[mid];
if (col == naturalRowIdx) {
break;
} else if (col < naturalRowIdx) {
left = mid + 1;
} else {
right = mid - 1;
}
mid = left + (right - left) / 2; // overflow-safe average
}
const int symOpposite = mid;
mmx2Subtraction<T, blocksize>(&mat[block * blocksize * blocksize],
&dInv[col * blocksize * blocksize],
&mat[symOpposite * blocksize * blocksize],
dInvTmp);
}
invBlockInPlace<T, blocksize>(dInvTmp);
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
}
}
}
}
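For reference while this kernel moves into detail/preconditionerKernels/DILUKernels: it evaluates the usual DILU diagonal recurrence, Dinv_i = (A_ii - sum over lower-triangular neighbours j of A_ij * Dinv_j * A_ji)^{-1}, processed level set by level set. A scalar serial sketch of the same recurrence (dense indexing only to keep the example short):

#include <cstddef>
#include <vector>

// Scalar DILU diagonal: dInv[i] = 1 / (A(i,i) - sum_{j < i, A(i,j) != 0} A(i,j) * dInv[j] * A(j,i)).
std::vector<double> diluDiagonal(const std::vector<std::vector<double>>& A)
{
    const std::size_t n = A.size();
    std::vector<double> dInv(n);
    for (std::size_t i = 0; i < n; ++i) {
        double pivot = A[i][i];
        for (std::size_t j = 0; j < i; ++j) {
            if (A[i][j] != 0.0) { // only entries present in the sparsity pattern contribute
                pivot -= A[i][j] * dInv[j] * A[j][i];
            }
        }
        dInv[i] = 1.0 / pivot;
    }
    return dInv;
}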
template <class T, int blocksize>
__global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv)
{
const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx];
const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1];
T dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInvTmp[i * blocksize + j] = diagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
}
}
for (int block = lowerRowStart; block < lowerRowEnd; ++block) {
const int col = naturalToReordered[lowerColIndices[block]];
int symOppositeIdx = upperRowIndices[col];
for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx) {
if (naturalRowIdx == upperColIndices[symOppositeIdx]) {
break;
}
}
const int symOppositeBlock = symOppositeIdx;
mmx2Subtraction<T, blocksize>(&reorderedLowerMat[block * blocksize * blocksize],
&dInv[col * blocksize * blocksize],
&reorderedUpperMat[symOppositeBlock * blocksize * blocksize],
dInvTmp);
}
invBlockInPlace<T, blocksize>(dInvTmp);
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
}
}
}
}
template <class T, int blocksize>
__global__ void cuMoveDataToReordered(
T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* indexConversion, size_t numberOfRows)
@ -505,195 +97,21 @@ namespace
}
}
}
// Kernel here is the function object of the cuda kernel
template <class Kernel>
inline int getCudaRecomendedThreadBlockSize(Kernel k, int suggestedThrBlockSize=-1)
{
if (suggestedThrBlockSize != -1){
return suggestedThrBlockSize;
}
// If no block size was suggested (i.e. not tuned), pick a fixed size on HIP, or use the CUDA occupancy API to maximize occupancy
#if USE_HIP
return 512;
#else
int blockSize = 1024;
int tmpGridSize;
std::ignore = cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0);
return blockSize;
#endif
}
inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize)
{
return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
}
} // namespace
template <class T, int blocksize>
void
invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
if (blocksize <= 3) {
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuInvertDiagonalAndFlatten<T, blocksize>);
int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
cuInvertDiagonalAndFlatten<T, blocksize>
<<<nThreadBlocks, threadBlockSize>>>(mat, rowIndices, colIndices, numberOfRows, vec);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
// perform the lower solve for all rows in the same level set
template <class T, int blocksize>
void
computeLowerSolveLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int thrBlockSize)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeLowerSolveLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
template <class T, int blocksize>
void
computeLowerSolveLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int thrBlockSize)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeLowerSolveLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
computeUpperSolveLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeUpperSolveLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeUpperSolveLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
void
computeUpperSolveLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeUpperSolveLevelSetSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeUpperSolveLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
void
computeDiluDiagonal(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv,
int thrBlockSize)
{
if (blocksize <= 3) {
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeDiluDiagonal<T, blocksize>, thrBlockSize);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonal<T, blocksize>
<<<nThreadBlocks, threadBlockSize>>>(reorderedMat,
rowIndices,
colIndices,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
template <class T, int blocksize>
void
computeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv,
int thrBlockSize)
{
if (blocksize <= 3) {
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeDiluDiagonalSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonalSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
lowerRowIndices,
lowerColIndices,
reorderedUpperMat,
upperRowIndices,
upperColIndices,
diagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
template <class T, int blocksize>
void
copyMatDataToReordered(T* srcMatrix,
int* srcRowIndices,
T* dstMatrix,
int* dstRowIndices,
int* naturalToReordered,
size_t numberOfRows,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuMoveDataToReordered<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
cuMoveDataToReordered<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows);
}
@ -712,8 +130,9 @@ copyMatDataToReorderedSplit(T* srcMatrix,
size_t numberOfRows,
int thrBlockSize)
{
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuMoveDataToReorderedSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
cuMoveDataToReorderedSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(srcMatrix,
srcRowIndices,
srcColumnIndices,
@ -727,16 +146,8 @@ copyMatDataToReorderedSplit(T* srcMatrix,
}
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*); \
template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t, int); \
template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int); \
template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
template void computeDiluDiagonalSplit<T, blocksize>( \
T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int); \
template void computeUpperSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void computeLowerSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \
template void computeUpperSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void computeLowerSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);

View File

@ -22,189 +22,6 @@
#include <vector>
namespace Opm::cuistl::detail
{
/**
* @brief This function receives a matrix, and the inverse of the matrix containing only its diagonal is stored in vec
* @param mat pointer to GPU memory containing the nonzero values of the sparse matrix
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param numberOfRows Integer describing the number of rows in the matrix
* @param[out] vec Pointer to the vector where the inverse of the diagonal matrix should be stored
*/
template <class T, int blocksize>
void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such
* that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
* reorderedMat
* @param d Stores the defect
* @param [out] v Will store the results of the lower solve
*/
template <class T, int blocksize>
void computeLowerSolveLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int threadBlockSize);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly lower triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
* reorderedUpperMat
* @param d Stores the defect
* @param [out] v Will store the results of the lower solve
*/
template <class T, int blocksize>
void computeLowerSolveLevelSetSplit(T* reorderedUpperMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int threadBlockSize);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such
* that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
* @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
* solve
*/
template <class T, int blocksize>
void computeUpperSolveLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
template <class T, int blocksize>
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly upper triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
* @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
* solve
*/
void computeUpperSolveLevelSetSplit(T* reorderedUpperMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
/**
* @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
* containing the diagonal blocks
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such
* that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param reorderedToNatural Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param naturalToReordered Integer array containing the mapping from an index in the naturally ordered matrix to its
* corresponding index in the reordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
*/
template <class T, int blocksize>
void computeDiluDiagonal(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* reorderedToNatural,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
T* dInv,
int threadBlockSize);
template <class T, int blocksize>
/**
* @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector
* containing the diagonal blocks
* @param reorderedLowerMat pointer to GPU memory containing the nonzero values of the strictly lower triangular sparse
* matrix. The matrix is reordered such that rows in the same level set are contiguous
* @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant with the bsr format
* @param lowerColIndices Pointer to vector on GPU containing col indices of the lower matrix compliant with the bsr format
* @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the strictly upper triangular sparse
* matrix. The matrix is reordered such that rows in the same level set are contiguous
* @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant with the bsr format
* @param upperColIndices Pointer to vector on GPU containing col indices of the upper matrix compliant with the bsr format
* @param reorderedToNatural Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param diagonal The diagonal elements of the reordered matrix
* @param naturalToReordered Integer array containing the mapping from an index in the naturally ordered matrix to its
* corresponding index in the reordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
*/
void computeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
T* dInv,
int threadBlockSize);
/**
* @brief Reorders the elements of a matrix by copying them from one matrix to another using a permutation list
* @param srcMatrix The source matrix we will copy data from
@ -216,26 +33,40 @@ void computeDiluDiagonalSplit(T* reorderedLowerMat,
* @param numberOfRows The number of rows in the matrices
*/
template <class T, int blocksize>
void copyMatDataToReordered(T* srcMatrix,
int* srcRowIndices,
T* dstMatrix,
int* dstRowIndices,
int* naturalToReordered,
size_t numberOfRows,
int threadBlockSize);
/**
* @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list
* @param srcMatrix The source matrix we will copy data from
* @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant with the bsr format
* @param [out] dstLowerMatrix The destination of entries that originate from the strictly lower triangular matrix
* @param dstLowerRowIndices Pointer to vector on GPU containing row indices for the destination lower matrix compliant
* with the bsr format
* @param [out] dstUpperMatrix The destination of entries that originate from the strictly upper triangular matrix
* @param dstUpperRowIndices Pointer to vector on GPU containing row indices for the destination upper matrix compliant
* with the bsr format
* @param [out] dstDiag The destination buffer for the diagonal part of the matrix
* @param naturalToReordered Permutation list that converts indices in the src matrix to the indices in the dst matrix
* @param numberOfRows The number of rows in the matrices
*/
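// Example (illustrative values only): for a natural row r with block columns {2, 5, r, 9}, the blocks in columns 2 and 5
// are copied into dstLowerMatrix, the column-r block into dstDiag, and the column-9 block into dstUpperMatrix, with the
// destination row given by naturalToReordered[r].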
template <class T, int blocksize>
void copyMatDataToReorderedSplit(T* srcMatrix,
int* srcRowIndices,
int* srcColumnIndices,
T* dstLowerMatrix,
int* dstLowerRowIndices,
T* dstUpperMatrix,
int* dstUpperRowIndices,
T* dstDiag,
int* naturalToReordered,
size_t numberOfRows,
int threadBlockSize);
} // namespace Opm::cuistl::detail
#endif

View File

@ -0,0 +1,234 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
#define OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
#include <config.h>
#include <cuda_runtime.h>
/*
This file provides inlineable functions intended for CUDA kernels operating on block matrix elements.
The functions provide various matrix operations that are used by the preconditioners.
*/
namespace
{
// TODO: figure out if this can be generalized effectively; this seems excessively verbose
// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void
invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
{
if (blocksize == 1) {
dstBlock[0] = 1.0 / (srcBlock[0]);
} else if (blocksize == 2) {
T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
dstBlock[0] = srcBlock[3] * detInv;
dstBlock[1] = -srcBlock[1] * detInv;
dstBlock[2] = -srcBlock[2] * detInv;
dstBlock[3] = srcBlock[0] * detInv;
} else if (blocksize == 3) {
// based on Dune implementation
T t4 = srcBlock[0] * srcBlock[4];
T t6 = srcBlock[0] * srcBlock[5];
T t8 = srcBlock[1] * srcBlock[3];
T t10 = srcBlock[2] * srcBlock[3];
T t12 = srcBlock[1] * srcBlock[6];
T t14 = srcBlock[2] * srcBlock[6];
T t17 = 1.0
/ (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
- t14 * srcBlock[4]); // t17 is 1/determinant
dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17;
dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17;
dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17;
dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17;
dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17;
dstBlock[5] = -(t6 - t10) * t17;
dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17;
dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17;
dstBlock[8] = (t4 - t8) * t17;
}
}
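// For reference, the 2x2 branch above is the standard adjugate formula: for a row-major block [a b; c d] with
// determinant det = a*d - b*c, the inverse is (1/det) * [d -b; -c a]. The 3x3 branch expands the corresponding
// cofactor formula, with t17 holding 1/determinant.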
// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void
invBlockInPlace(T* __restrict__ block)
{
if (blocksize == 1) {
block[0] = 1.0 / (block[0]);
} else if (blocksize == 2) {
T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]);
T temp = block[0];
block[0] = block[3] * detInv;
block[1] = -block[1] * detInv;
block[2] = -block[2] * detInv;
block[3] = temp * detInv;
} else if (blocksize == 3) {
T t4 = block[0] * block[4];
T t6 = block[0] * block[5];
T t8 = block[1] * block[3];
T t10 = block[2] * block[3];
T t12 = block[1] * block[6];
T t14 = block[2] * block[6];
T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
T t17 = T(1.0) / det;
T matrix01 = block[1];
T matrix00 = block[0];
T matrix10 = block[3];
T matrix11 = block[4];
block[0] = (block[4] * block[8] - block[5] * block[7]) * t17;
block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17;
block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17;
block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17;
block[4] = (matrix00 * block[8] - t14) * t17;
block[5] = -(t6 - t10) * t17;
block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17;
block[7] = -(matrix00 * block[7] - t12) * t17;
block[8] = (t4 - t8) * t17;
}
}
enum class MVType { SET, PLUS, MINUS };
// SET: c = A*b
// PLUS: c += A*b
// MINUS: c -= A*b
template <class T, int blocksize, MVType OP>
__device__ __forceinline__ void
matrixVectorProductWithAction(const T* A, const T* b, T* c)
{
for (int i = 0; i < blocksize; ++i) {
if (OP == MVType::SET) {
c[i] = 0;
}
for (int j = 0; j < blocksize; ++j) {
if (OP == MVType::SET || OP == MVType::PLUS) {
c[i] += A[i * blocksize + j] * b[j];
} else if (OP == MVType::MINUS) {
c[i] -= A[i * blocksize + j] * b[j];
}
}
}
}
template <class T, int blocksize>
__device__ __forceinline__ void
mv(const T* a, const T* b, T* c)
{
matrixVectorProductWithAction<T, blocksize, MVType::SET>(a, b, c);
}
template <class T, int blocksize>
__device__ __forceinline__ void
umv(const T* a, const T* b, T* c)
{
matrixVectorProductWithAction<T, blocksize, MVType::PLUS>(a, b, c);
}
template <class T, int blocksize>
__device__ __forceinline__ void
mmv(const T* a, const T* b, T* c)
{
matrixVectorProductWithAction<T, blocksize, MVType::MINUS>(a, b, c);
}
// dst -= A*B*C
template <class T, int blocksize>
__device__ __forceinline__ void
mmx2Subtraction(T* A, T* B, T* C, T* dst)
{
T tmp[blocksize * blocksize] = {0};
// tmp = A*B
for (int i = 0; i < blocksize; ++i) {
for (int k = 0; k < blocksize; ++k) {
for (int j = 0; j < blocksize; ++j) {
tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
}
}
}
// dst = tmp*C
for (int i = 0; i < blocksize; ++i) {
for (int k = 0; k < blocksize; ++k) {
for (int j = 0; j < blocksize; ++j) {
dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j];
}
}
}
}
// C = A*B, assumes the three buffers do not overlap
template <class T, int blocksize>
__device__ __forceinline__ void
mmNoOverlap(T* A, T* B, T* C)
{
for (int i = 0; i < blocksize; ++i) {
for (int k = 0; k < blocksize; ++k) {
for (int j = 0; j < blocksize; ++j) {
C[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
}
}
}
}
// C = A*B, buffers may overlap
template <class T, int blocksize>
__device__ __forceinline__ void
mmOverlap(T* A, T* B, T* C)
{
T tmp[blocksize * blocksize] = {0};
for (int i = 0; i < blocksize; ++i) {
for (int k = 0; k < blocksize; ++k) {
for (int j = 0; j < blocksize; ++j) {
tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
}
}
}
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
C[i * blocksize + j] = tmp[i * blocksize + j];
}
}
}
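// The temporary above is what makes updates of the form A = A * B safe when the output aliases an input,
// e.g. scaling an off-diagonal block in place by an already-inverted diagonal block during the LU factorization.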
// A -= B
template <class T, int blocksize>
__device__ __forceinline__ void
matrixSubtraction(T* A, T* B)
{
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
A[i * blocksize + j] -= B[i * blocksize + j];
}
}
}
} // namespace
#endif

View File

@ -0,0 +1,66 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_GPU_THREAD_UTILS_HPP
#define OPM_GPU_THREAD_UTILS_HPP
#include <cstddef>
#include <cuda.h>
#include <cuda_runtime.h>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
/*
This file provides some logic for handling how to choose the correct thread-block size
*/
namespace Opm::cuistl::detail
{
constexpr inline size_t
getThreads([[maybe_unused]] size_t numberOfRows)
{
return 1024;
}
inline size_t
getBlocks(size_t numberOfRows)
{
const auto threads = getThreads(numberOfRows);
return (numberOfRows + threads - 1) / threads;
}
// Kernel here is the function object of the cuda kernel
template <class Kernel>
inline int
getCudaRecomendedThreadBlockSize(Kernel k, int suggestedThrBlockSize = -1)
{
if (suggestedThrBlockSize != -1) {
return suggestedThrBlockSize;
}
int blockSize;
int tmpGridSize;
OPM_CUDA_SAFE_CALL(cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0));
return blockSize;
}
inline int
getNumberOfBlocks(int wantedThreads, int threadBlockSize)
{
return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
}
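// A minimal usage sketch (illustrative only; myKernel, numberOfRows and suggestedThrBlockSize are placeholders):
//
//   int threadBlockSize = getCudaRecomendedThreadBlockSize(myKernel<double, 3>, suggestedThrBlockSize);
//   int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
//   myKernel<double, 3><<<nThreadBlocks, threadBlockSize>>>(/* kernel arguments */);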
} // namespace Opm::cuistl::detail
#endif

View File

@ -0,0 +1,437 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>
#include <stdexcept>
namespace Opm::cuistl::detail::DILU
{
namespace
{
template <class T, int blocksize>
__global__ void cuSolveLowerLevelSet(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedRowIdx];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = d[naturalRowIdx * blocksize + i];
}
for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuSolveLowerLevelSetSplit(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedRowIdx];
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = d[naturalRowIdx * blocksize + i];
}
// TODO: remove the first condition in the for loop
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuSolveUpperLevelSet(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize] = {0};
for (int block = nnzIdxLim - 1; colIndices[block] > naturalRowIdx; --block) {
const int col = colIndices[block];
umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
template <class T, int blocksize>
__global__ void cuSolveUpperLevelSetSplit(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v)
{
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedRowIdx];
const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
const int naturalRowIdx = indexConversion[reorderedRowIdx];
T rhs[blocksize] = {0};
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
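// The two kernels below assemble the DILU diagonal. Loosely following the standard DILU construction, for each row i
// they compute D_i = (A_ii - sum over j < i with A_ij nonzero of A_ij * D_j^-1 * A_ji)^-1, where the rows j belong to
// earlier level sets and their inverted diagonal blocks are already stored in dInv.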
template <class T, int blocksize>
__global__ void cuComputeDiluDiagonal(T* mat,
int* rowIndices,
int* colIndices,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv)
{
const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
const size_t nnzIdx = rowIndices[reorderedRowIdx];
int diagIdx = nnzIdx;
while (colIndices[diagIdx] != naturalRowIdx) {
++diagIdx;
}
T dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInvTmp[i * blocksize + j] = mat[diagIdx * blocksize * blocksize + i * blocksize + j];
}
}
for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
const int col = naturalToReordered[colIndices[block]];
// find element with indices swapped
// Binary search over the blocks in the right row; [rowIndices[col], rowIndices[col + 1] - 1] defines the range
// we binary search over
int left = rowIndices[col];
int right = rowIndices[col + 1] - 1;
int mid;
while (left <= right) {
mid = left + (right - left) / 2; // overflow-safe average
const int col = colIndices[mid];
if (col == naturalRowIdx) {
break;
} else if (col < naturalRowIdx) {
left = mid + 1;
} else {
right = mid - 1;
}
}
const int symOpposite = mid;
mmx2Subtraction<T, blocksize>(&mat[block * blocksize * blocksize],
&dInv[col * blocksize * blocksize],
&mat[symOpposite * blocksize * blocksize],
dInvTmp);
}
invBlockInPlace<T, blocksize>(dInvTmp);
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
}
}
}
}
template <class T, int blocksize>
__global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv)
{
const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx];
const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1];
T dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInvTmp[i * blocksize + j] = diagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
}
}
for (int block = lowerRowStart; block < lowerRowEnd; ++block) {
const int col = naturalToReordered[lowerColIndices[block]];
int symOppositeIdx = upperRowIndices[col];
for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx) {
if (naturalRowIdx == upperColIndices[symOppositeIdx]) {
break;
}
}
const int symOppositeBlock = symOppositeIdx;
mmx2Subtraction<T, blocksize>(&reorderedLowerMat[block * blocksize * blocksize],
&dInv[col * blocksize * blocksize],
&reorderedUpperMat[symOppositeBlock * blocksize * blocksize],
dInvTmp);
}
invBlockInPlace<T, blocksize>(dInvTmp);
for (int i = 0; i < blocksize; ++i) {
for (int j = 0; j < blocksize; ++j) {
dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
}
}
}
}
} // namespace
// perform the lower solve for all rows in the same level set
template <class T, int blocksize>
void
solveLowerLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
template <class T, int blocksize>
void
solveLowerLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int thrBlockSize)
{
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveLowerLevelSetSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
solveUpperLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
void
solveUpperLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
{
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
void
computeDiluDiagonal(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv,
int thrBlockSize)
{
if (blocksize <= 3) {
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuComputeDiluDiagonal<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonal<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedMat,
rowIndices,
colIndices,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
template <class T, int blocksize>
void
computeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
T* dInv,
int thrBlockSize)
{
if (blocksize <= 3) {
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuComputeDiluDiagonalSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonalSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
lowerRowIndices,
lowerColIndices,
reorderedUpperMat,
upperRowIndices,
upperColIndices,
diagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet,
dInv);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int); \
template void computeDiluDiagonalSplit<T, blocksize>( \
T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int); \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \
template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
INSTANTIATE_KERNEL_WRAPPERS(float, 3);
INSTANTIATE_KERNEL_WRAPPERS(float, 4);
INSTANTIATE_KERNEL_WRAPPERS(float, 5);
INSTANTIATE_KERNEL_WRAPPERS(float, 6);
INSTANTIATE_KERNEL_WRAPPERS(double, 1);
INSTANTIATE_KERNEL_WRAPPERS(double, 2);
INSTANTIATE_KERNEL_WRAPPERS(double, 3);
INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
} // namespace Opm::cuistl::detail::DILU

View File

@ -0,0 +1,202 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_DILU_KERNELS_HPP
#define OPM_DILU_KERNELS_HPP
#include <cstddef>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
namespace Opm::cuistl::detail::DILU
{
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such
* that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
* reorderedMat
* @param d Stores the defect
* @param [out] v Will store the results of the lower solve
*/
template <class T, int blocksize>
void solveLowerLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int threadBlockSize);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly lower triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
* reorderedUpperMat
* @param d Stores the defect
* @param [out] v Will store the results of the lower solve
*/
template <class T, int blocksize>
void solveLowerLevelSetSplit(T* reorderedUpperMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
const T* d,
T* v,
int threadBlockSize);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such
* that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
* @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
* solve
*/
template <class T, int blocksize>
void solveUpperLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly upper triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
* @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
* solve
*/
template <class T, int blocksize>
void solveUpperLevelSetSplit(T* reorderedUpperMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
/**
* @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
* containing the diagonal blocks
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is reordered such
* that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant with the bsr format
* @param reorderedToNatural Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param naturalToReordered Integer array containing the mapping from an index in the naturally ordered matrix to its
* corresponding index in the reordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
*/
template <class T, int blocksize>
void computeDiluDiagonal(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* reorderedToNatural,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
T* dInv,
int threadBlockSize);
template <class T, int blocksize>
/**
* @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector
* containing the diagonal blocks
* @param reorderedLowerMat pointer to GPU memory containing the nonzero values of the strictly lower triangular sparse
* matrix. The matrix is reordered such that rows in the same level set are contiguous
* @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant with the bsr format
* @param lowerColIndices Pointer to vector on GPU containing col indices of the lower matrix compliant with the bsr format
* @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the strictly upper triangular sparse
* matrix. The matrix is reordered such that rows in the same level set are contiguous
* @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant with the bsr format
* @param upperColIndices Pointer to vector on GPU containing col indices of the upper matrix compliant with the bsr format
* @param reorderedToNatural Integer array containing the mapping from an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param diagonal The diagonal elements of the reordered matrix
* @param naturalToReordered Integer array containing the mapping from an index in the naturally ordered matrix to its
* corresponding index in the reordered matrix
* @param startIdx Index of the first row of the matrix to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
*/
void computeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
T* dInv,
int threadBlockSize);
} // namespace Opm::cuistl::detail::DILU
#endif

View File

@ -0,0 +1,476 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
#include <stdexcept>
/*
The LU factorization and apply steps are written based on the Dune implementations
*/
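/*
Roughly, the factorization kernels below perform the blocked ILU0 update row by row within each level set:
for every block A_ij with j < i, set A_ij = A_ij * A_jj^-1 (the diagonal block of the earlier row j is already
stored in inverted form), then for every column k > j present in both row i and row j, update
A_ik = A_ik - A_ij * A_jk; finally the diagonal block A_ii is inverted in place.
*/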
namespace Opm::cuistl::detail::ILU0
{
namespace
{
template <class T, int blocksize>
__global__ void cuLUFactorization(T* srcMatrix,
int* srcRowIndices,
int* srcColumnIndices,
int* naturalToReordered,
int* reorderedToNatual,
size_t rowsInLevelSet,
int startIdx)
{
auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
constexpr int scalarsInBlock = blocksize * blocksize;
if (reorderedIdx < rowsInLevelSet + startIdx) {
int naturalIdx = reorderedToNatual[reorderedIdx];
// for each element under the diagonal
int endOfRowI = srcRowIndices[reorderedIdx + 1];
int ij;
for (ij = srcRowIndices[reorderedIdx];
ij < srcRowIndices[reorderedIdx + 1] && srcColumnIndices[ij] < naturalIdx;
++ij) {
// consider the block we are looking at to be A_ij
// we want to set B_ij = A_ij * A_jj, where A_jj is a diagonal element on a previous row, meaning it is
// already inverted. Find the A_jj block
int jj; // index in bcsr of the A_jj element
int j = naturalToReordered[srcColumnIndices[ij]];
int startOfRowJ = srcRowIndices[j];
int endOfRowJ = srcRowIndices[j + 1];
for (int blockInRowJ = startOfRowJ; blockInRowJ < endOfRowJ; ++blockInRowJ) {
if (srcColumnIndices[blockInRowJ] == srcColumnIndices[ij]) {
jj = blockInRowJ;
break;
}
}
// the DUNE implementation is inplace, and I need to take care to make sure
// this out of place implementation is equivalent
mmOverlap<T, blocksize>(
&srcMatrix[ij * scalarsInBlock], &srcMatrix[jj * scalarsInBlock], &srcMatrix[ij * scalarsInBlock]);
// we have now accounted for an element under the diagonal and the diagonal element above it
// now iterate over the blocks in row i and row j that share a column index
int jk = jj + 1;
int ik = ij + 1;
// this code is NOT gpu friendly, thread divergence will probably slow this down significantly...
while (ik != endOfRowI && jk != endOfRowJ) {
if (srcColumnIndices[ik] == srcColumnIndices[jk]) {
// A_jk = A_ij * A_jk
T tmp[scalarsInBlock] = {0};
mmNoOverlap<T, blocksize>(
&srcMatrix[ij * scalarsInBlock], &srcMatrix[jk * scalarsInBlock], tmp);
matrixSubtraction<T, blocksize>(&srcMatrix[ik * scalarsInBlock], tmp);
++ik;
++jk;
} else {
if (srcColumnIndices[ik] < srcColumnIndices[jk]) {
++ik;
} else {
++jk;
}
}
}
}
invBlockInPlace<T, blocksize>(&srcMatrix[ij * scalarsInBlock]);
}
}
// An index in the BCSR matrix can be in one of three states: below, on, or above the diagonal
enum class POSITION_TYPE { UNDER_DIAG, ON_DIAG, ABOVE_DIAG };
// This handles incrementing an index on a row that is stored in a lower/upper split data structure.
// If we are in the lower or upper part we increment as usual.
// If we reach the diagonal, we only update the state; the diagonal blocks are stored separately, so no index into them is needed.
// When we increment past the diagonal, we set the index to the first block in the upper part of that row.
__device__ __forceinline__ void
incrementAcrossSplitStructure(int& idx, POSITION_TYPE& currentState, int lastLowerIdx, int firstUpperIdx)
{
if (currentState == POSITION_TYPE::UNDER_DIAG) {
if (idx != lastLowerIdx - 1) { // we should increment to be on the diagonal
++idx;
} else {
currentState = POSITION_TYPE::ON_DIAG;
}
} else if (currentState == POSITION_TYPE::ON_DIAG) {
idx = firstUpperIdx;
currentState = POSITION_TYPE::ABOVE_DIAG;
} else { // currentState = POSITION_TYPE::ABOVE_DIAG
++idx;
}
}
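// Example traversal (illustrative indices): for a row with lower blocks at indices 4..6 (lastLowerIdx == 7) and an
// upper part starting at firstUpperIdx == 12, starting from (idx == 4, UNDER_DIAG) successive calls give 5, 6, then
// a switch to ON_DIAG (the diagonal lives in a separate buffer), then 12 with ABOVE_DIAG, then 13, 14, ...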
template <class T, int blocksize>
__global__ void cuLUFactorizationSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet)
{
auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
constexpr int scalarsInBlock = blocksize * blocksize;
if (reorderedIdx < rowsInLevelSet + startIdx) {
int naturalIdx = reorderedToNatural[reorderedIdx];
// for each element under the diagonal
int endOfRowILower = lowerRowIndices[reorderedIdx + 1];
int endOfRowIUpper = upperRowIndices[reorderedIdx + 1];
int startOfRowILower = lowerRowIndices[reorderedIdx];
int startOfRowIUpper = upperRowIndices[reorderedIdx];
for (int ij = startOfRowILower; ij < endOfRowILower; ++ij) {
// consider the block we are looking at to be A_ij
// we want to set B_ij = A_ij * A_jj, where A_jj is a diagonal element on a previous row, meaning it is
// already inverted. Find the A_jj block
int j = naturalToReordered[lowerColIndices[ij]];
int endOfRowJ = upperRowIndices[j + 1];
int startOfRowJ = upperRowIndices[j];
// the DUNE implementation is inplace, and I need to take care to make sure
// this out of place implementation is equivalent
mmOverlap<T, blocksize>(&reorderedLowerMat[ij * scalarsInBlock],
&diagonal[j * scalarsInBlock],
&reorderedLowerMat[ij * scalarsInBlock]);
// we have now accounted for an element under the diagonal and the diagonal element above it
// now iterate over the blocks in row i and row j that share a column index
int jk = startOfRowJ;
POSITION_TYPE ikState = POSITION_TYPE::UNDER_DIAG; // it is guaranteed that the ij element we consider
// is under the diagonal
int ik = ij;
incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
// this code is NOT gpu friendly, thread divergence will probably slow this down significantly...
// checking for ik != endOfRowIUpper is not sufficient alone because
// the same block index might be found in the lower part of the matrix
while (!(ik == endOfRowIUpper && ikState == POSITION_TYPE::ABOVE_DIAG) && jk != endOfRowJ) {
T* ikBlockPtr;
int ikColumn;
if (ikState == POSITION_TYPE::UNDER_DIAG) {
ikBlockPtr = &reorderedLowerMat[ik * scalarsInBlock];
ikColumn = lowerColIndices[ik];
} else if (ikState == POSITION_TYPE::ON_DIAG) {
ikBlockPtr = &diagonal[reorderedIdx * scalarsInBlock];
ikColumn = naturalIdx;
} else { // ikState == POSITION_TYPE::ABOVE_DIAG
ikBlockPtr = &reorderedUpperMat[ik * scalarsInBlock];
ikColumn = upperColIndices[ik];
}
if (ikColumn == upperColIndices[jk]) {
// A_jk = A_ij * A_jk
T tmp[scalarsInBlock] = {0};
mmNoOverlap<T, blocksize>(
&reorderedLowerMat[ij * scalarsInBlock], &reorderedUpperMat[jk * scalarsInBlock], tmp);
matrixSubtraction<T, blocksize>(ikBlockPtr, tmp);
incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
++jk;
} else {
if (ikColumn < upperColIndices[jk]) {
incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
} else {
++jk;
}
}
}
}
invBlockInPlace<T, blocksize>(&diagonal[reorderedIdx * scalarsInBlock]);
}
}
template <class T, int blocksize>
__global__ void cuSolveLowerLevelSet(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* d,
T* v)
{
auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedIdx];
const int naturalRowIdx = indexConversion[reorderedIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = d[naturalRowIdx * blocksize + i];
}
int block = nnzIdx;
for (; colIndices[block] < naturalRowIdx; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
for (int i = 0; i < blocksize; ++i) {
v[naturalRowIdx * blocksize + i] = rhs[i];
}
}
}
template <class T, int blocksize>
__global__ void cuSolveUpperLevelSet(
T* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, T* v)
{
auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
const int naturalRowIdx = indexConversion[reorderedIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = v[naturalRowIdx * blocksize + i];
}
int block = nnzIdxLim - 1;
for (; colIndices[block] > naturalRowIdx; --block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mv<T, blocksize>(&mat[block * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
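// In the two apply kernels above, the lower solve is a forward substitution with an implicit unit diagonal
// (the loop stops before the diagonal block), while the upper solve is a backward substitution that ends by
// multiplying with the diagonal block, which the factorization kernels have already stored in inverted form.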
template <class T, int blocksize>
__global__ void cuSolveLowerLevelSetSplit(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* d,
T* v)
{
auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedIdx];
const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
const int naturalRowIdx = indexConversion[reorderedIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = d[naturalRowIdx * blocksize + i];
}
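// the matrix is strictly lower triangular, so every block in this row belongs to the
// forward substitution: rhs -= L_ij * v_j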
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
for (int i = 0; i < blocksize; ++i) {
v[naturalRowIdx * blocksize + i] = rhs[i];
}
}
}
template <class T, int blocksize>
__global__ void cuSolveUpperLevelSetSplit(T* mat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v)
{
auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedIdx < rowsInLevelSet + startIdx) {
const size_t nnzIdx = rowIndices[reorderedIdx];
const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
const int naturalRowIdx = indexConversion[reorderedIdx];
T rhs[blocksize];
for (int i = 0; i < blocksize; i++) {
rhs[i] = v[naturalRowIdx * blocksize + i];
}
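// the matrix is strictly upper triangular, so subtract every block's contribution and
// finish by applying the inverted diagonal block stored in dInv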
for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
}
mv<T, blocksize>(&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
}
}
} // namespace
template <class T, int blocksize>
void
solveLowerLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* d,
T* v,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
solveUpperLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
T* v,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
}
template <class T, int blocksize>
void
solveLowerLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* d,
T* v,
int thrBlockSize)
{
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveLowerLevelSetSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveLowerLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
solveUpperLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int thrBlockSize)
{
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
cuSolveUpperLevelSetSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuSolveUpperLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}
template <class T, int blocksize>
void
LUFactorization(T* srcMatrix,
int* srcRowIndices,
int* srcColumnIndices,
int* naturalToReordered,
int* reorderedToNatural,
size_t rowsInLevelSet,
int startIdx,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorization<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuLUFactorization<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatural, rowsInLevelSet, startIdx);
}
template <class T, int blocksize>
void
LUFactorizationSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
const int startIdx,
int rowsInLevelSet,
int thrBlockSize)
{
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<T, blocksize>, thrBlockSize);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuLUFactorizationSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
lowerRowIndices,
lowerColIndices,
reorderedUpperMat,
upperRowIndices,
upperColIndices,
diagonal,
reorderedToNatural,
naturalToReordered,
startIdx,
rowsInLevelSet);
}
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int); \
template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int); \
template void LUFactorizationSplit<T, blocksize>( \
T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, int); \
template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int); \
template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
INSTANTIATE_KERNEL_WRAPPERS(float, 3);
INSTANTIATE_KERNEL_WRAPPERS(float, 4);
INSTANTIATE_KERNEL_WRAPPERS(float, 5);
INSTANTIATE_KERNEL_WRAPPERS(float, 6);
INSTANTIATE_KERNEL_WRAPPERS(double, 1);
INSTANTIATE_KERNEL_WRAPPERS(double, 2);
INSTANTIATE_KERNEL_WRAPPERS(double, 3);
INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
} // namespace Opm::cuistl::detail::ILU0

View File

@ -0,0 +1,193 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_ILU0_KERNELS_HPP
#define OPM_ILU0_KERNELS_HPP
#include <cstddef>
#include <vector>
namespace Opm::cuistl::detail::ILU0
{
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
* @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
* @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
* in the naturally ordered matrix
* @param startIdx Index of the first row of the level set to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param [out] v Will store the result of the upper solve
* @param threadBlockSize The number of threads per thread block. Pass -1 if no block size has already been chosen
*/
template <class T, int blocksize>
void solveUpperLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
T* v,
int threadBlockSize);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
* @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
* @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
* in the naturally ordered matrix
* @param startIdx Index of the first row of the level set to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param d Stores the defect
* @param [out] v Will store the result of the lower solve
* @param threadBlockSize The number of threads per thread block. Pass -1 if no block size has already been chosen
*/
template <class T, int blocksize>
void solveLowerLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* d,
T* v,
int threadBlockSize);
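// The two level-set solves above are typically chained by the calling preconditioner: the lower
// solve walks the level sets in order, the upper solve in reverse order. A minimal sketch,
// assuming a caller-provided list of level-set sizes (levelSetSizes, reorderedMat, d and v are
// hypothetical placeholders, not part of this interface):
//
//   int levelStart = 0;
//   for (const int rowsInLevel : levelSetSizes) {
//       // rows inside one level set are independent and are solved in parallel on the GPU
//       solveLowerLevelSet<double, 3>(reorderedMat, rowIndices, colIndices, indexConversion,
//                                     levelStart, rowsInLevel, d, v, -1);
//       levelStart += rowsInLevel;
//   }
//   for (auto it = levelSetSizes.rbegin(); it != levelSetSizes.rend(); ++it) {
//       levelStart -= *it;
//       solveUpperLevelSet<double, 3>(reorderedMat, rowIndices, colIndices, indexConversion,
//                                     levelStart, *it, v, -1);
//   }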
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly upper triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
* @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
* @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
* in the naturally ordered matrix
* @param startIdx Index of the first row of the level set to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param dInv The inverted diagonal blocks of the ILU0 factorization
* @param [out] v Will store the result of the upper solve. On entry it should contain the result of the lower
* solve
* @param threadBlockSize The number of threads per thread block. Pass -1 if no block size has already been chosen
*/
template <class T, int blocksize>
void solveUpperLevelSetSplit(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* dInv,
T* v,
int threadBlockSize);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedLowerMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly lower triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
* @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
* @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
* in the naturally ordered matrix
* @param startIdx Index of the first row of the level set to be solved
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
* function
* @param d Stores the defect
* @param [out] v Will store the result of the lower solve
* @param threadBlockSize The number of threads per thread block. Pass -1 if no block size has already been chosen
*/
template <class T, int blocksize>
void solveLowerLevelSetSplit(T* reorderedLowerMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* d,
T* v,
int threadBlockSize);
/**
* @brief Computes the ILU0 factorization of the input BCSR matrix, which is stored in a reordered way. The diagonal
* blocks store the inverse of the original diagonal blocks
* @param [out] reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
* reordered such that rows in the same level set are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
* @param columnIndices Pointer to vector on GPU containing column indices compliant with the BSR format
* @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding
* index in the reordered matrix
* @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows factorized in parallel by
* this function
* @param startIdx Index of the first row of the level set to be factorized
* @param threadBlockSize The number of threads per thread block. Pass -1 if no block size has already been chosen
*/
template <class T, int blocksize>
void LUFactorization(T* reorderedMat,
int* rowIndices,
int* columnIndices,
int* naturalToReordered,
int* reorderedToNatural,
size_t rowsInLevelSet,
int startIdx,
int threadBlockSize);
/**
* @brief Computes the ILU0 factorization in-place of a BCSR matrix stored in a split format (lower, diagonal and upper
* triangular part)
* @param [out] reorderedLowerMat pointer to GPU memory containing the nonzero values of the strictly lower triangular
* sparse matrix. The matrix is reordered such that rows in the same level set are contiguous
* @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant with the BSR
* format
* @param lowerColIndices Pointer to vector on GPU containing column indices of the lower matrix compliant with the BSR
* format
* @param [out] reorderedUpperMat pointer to GPU memory containing the nonzero values of the strictly upper triangular
* sparse matrix. The matrix is reordered such that rows in the same level set are contiguous
* @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant with the BSR
* format
* @param upperColIndices Pointer to vector on GPU containing column indices of the upper matrix compliant with the BSR
* format
* @param [out] diagonal The inverted diagonal blocks of the ILU0 factorization
* @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding
* index in the naturally ordered matrix
* @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding
* index in the reordered matrix
* @param startIdx Index of the first row of the level set to be factorized
* @param rowsInLevelSet Number of rows in this level set, which equals the number of rows factorized in parallel by
* this function
* @param threadBlockSize The number of threads per thread block. Pass -1 if no block size has already been chosen
*/
template <class T, int blocksize>
void LUFactorizationSplit(T* reorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
T* reorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
T* diagonal,
int* reorderedToNatural,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
int threadBlockSize);
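// A minimal sketch of how the split factorization is expected to be driven, one level set at a
// time in ascending order (illustrative only; levelSetSizes and the device pointers are
// hypothetical placeholders owned by the calling preconditioner):
//
//   int levelStart = 0;
//   for (const int rowsInLevel : levelSetSizes) {
//       LUFactorizationSplit<double, 3>(lowerMat, lowerRowIndices, lowerColIndices,
//                                       upperMat, upperRowIndices, upperColIndices,
//                                       diagonal, reorderedToNatural, naturalToReordered,
//                                       levelStart, rowsInLevel, -1);
//       levelStart += rowsInLevel;
//   }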
} // namespace Opm::cuistl::detail::ILU0
#endif

View File

@ -0,0 +1,88 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
#include <stdexcept>
namespace Opm::cuistl::detail::JAC
{
namespace
{
template <class T, int blocksize>
__global__ void
cuInvertDiagonalAndFlatten(T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
const auto row = blockDim.x * blockIdx.x + threadIdx.x;
if (row < numberOfRows) {
size_t nnzIdx = rowIndices[row];
size_t nnzIdxLim = rowIndices[row + 1];
// search along the row for the diagonal block; check the bound first so that we never read
// past the end of the row if the structural diagonal entry is missing
while (nnzIdx < nnzIdxLim && colIndices[nnzIdx] != row) {
++nnzIdx;
}
// diagBlock points to the start of where the diagonal block is stored
T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
// vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
// element should be stored
T* vecBlock = &vec[blocksize * blocksize * row];
invBlockOutOfPlace<T, blocksize>(diagBlock, vecBlock);
}
}
} // namespace
template <class T, int blocksize>
void
invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
if (blocksize <= 3) {
int threadBlockSize
= ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuInvertDiagonalAndFlatten<T, blocksize>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
cuInvertDiagonalAndFlatten<T, blocksize>
<<<nThreadBlocks, threadBlockSize>>>(mat, rowIndices, colIndices, numberOfRows, vec);
} else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
}
}
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*);
INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
INSTANTIATE_KERNEL_WRAPPERS(float, 3);
INSTANTIATE_KERNEL_WRAPPERS(float, 4);
INSTANTIATE_KERNEL_WRAPPERS(float, 5);
INSTANTIATE_KERNEL_WRAPPERS(float, 6);
INSTANTIATE_KERNEL_WRAPPERS(double, 1);
INSTANTIATE_KERNEL_WRAPPERS(double, 2);
INSTANTIATE_KERNEL_WRAPPERS(double, 3);
INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
} // namespace Opm::cuistl::detail::JAC

View File

@ -0,0 +1,38 @@
/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_JAC_KERNELS_HPP
#define OPM_JAC_KERNELS_HPP
#include <cstddef>
#include <vector>
namespace Opm::cuistl::detail::JAC
{
/**
* @brief Computes the inverse of each diagonal block of the matrix and stores the flattened result in vec
* @param mat pointer to GPU memory containing the nonzero values of the sparse matrix
* @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
* @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
* @param numberOfRows Integer describing the number of rows in the matrix
* @param[out] vec Pointer to the vector where the inverted diagonal blocks should be stored
*/
template <class T, int blocksize>
void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec);
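// A minimal usage sketch, mirroring how the unit tests exercise this function (B is a
// caller-provided block-sparse matrix and N its number of block rows; both are placeholders):
//
//   auto m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
//   Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
//   Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<T, blocksize>(
//       m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());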
} // namespace Opm::cuistl::detail::JAC
#endif

View File

@ -16,14 +16,15 @@
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
#include <stdexcept>
#include <config.h>
namespace Opm::cuistl::detail
{
@ -83,20 +84,8 @@ namespace
}
}
constexpr inline size_t getThreads([[maybe_unused]] size_t numberOfElements)
{
return 1024;
}
inline size_t getBlocks(size_t numberOfElements)
{
const auto threads = getThreads(numberOfElements);
return (numberOfElements + threads - 1) / threads;
}
template <class T>
__global__ void
prepareSendBufKernel(const T* a, T* buffer, size_t numberOfElements, const int* indices)
__global__ void prepareSendBufKernel(const T* a, T* buffer, size_t numberOfElements, const int* indices)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
@ -105,8 +94,7 @@ namespace
}
}
template <class T>
__global__ void
syncFromRecvBufKernel(T* a, T* buffer, size_t numberOfElements, const int* indices)
__global__ void syncFromRecvBufKernel(T* a, T* buffer, size_t numberOfElements, const int* indices)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
@ -116,33 +104,13 @@ namespace
}
} // namespace
// Kernel here is the function object of the cuda kernel
template <class Kernel>
inline int getCudaRecomendedThreadBlockSize(Kernel k)
{
#if USE_HIP
return 512;
#else
int blockSize;
int tmpGridSize;
std::ignore = cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0);
return blockSize;
#endif
}
inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize)
{
return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
}
template <class T>
void
setVectorValue(T* deviceData, size_t numberOfElements, const T& value)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(setVectorValueKernel<T>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
setVectorValueKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
deviceData, numberOfElements, value);
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(setVectorValueKernel<T>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
setVectorValueKernel<<<nThreadBlocks, threadBlockSize>>>(deviceData, numberOfElements, value);
}
template void setVectorValue(double*, size_t, const double&);
@ -153,10 +121,9 @@ template <class T>
void
setZeroAtIndexSet(T* deviceData, size_t numberOfElements, const int* indices)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(setZeroAtIndexSetKernel<T>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
setZeroAtIndexSetKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
deviceData, numberOfElements, indices);
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(setZeroAtIndexSetKernel<T>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
setZeroAtIndexSetKernel<<<nThreadBlocks, threadBlockSize>>>(deviceData, numberOfElements, indices);
}
template void setZeroAtIndexSet(double*, size_t, const int*);
template void setZeroAtIndexSet(float*, size_t, const int*);
@ -164,12 +131,16 @@ template void setZeroAtIndexSet(int*, size_t, const int*);
template <class T>
T
innerProductAtIndices(cublasHandle_t cublasHandle, const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices)
innerProductAtIndices(cublasHandle_t cublasHandle,
const T* deviceA,
const T* deviceB,
T* buffer,
size_t numberOfElements,
const int* indices)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(elementWiseMultiplyKernel<T>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
elementWiseMultiplyKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
deviceA, deviceB, buffer, numberOfElements, indices);
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(elementWiseMultiplyKernel<T>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
elementWiseMultiplyKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, deviceB, buffer, numberOfElements, indices);
// TODO: [perf] Get rid of the allocation here.
CuVector<T> oneVector(numberOfElements);
@ -184,11 +155,12 @@ template float innerProductAtIndices(cublasHandle_t, const float*, const float*,
template int innerProductAtIndices(cublasHandle_t, const int*, const int*, int* buffer, size_t, const int*);
template <class T>
void prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
void
prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(prepareSendBufKernel<T>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
prepareSendBufKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(deviceA, buffer, numberOfElements, indices);
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(prepareSendBufKernel<T>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
prepareSendBufKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, buffer, numberOfElements, indices);
OPM_CUDA_SAFE_CALL(cudaDeviceSynchronize()); // The buffers are prepared for MPI. Wait for them to finish.
}
template void prepareSendBuf(const double* deviceA, double* buffer, size_t numberOfElements, const int* indices);
@ -196,12 +168,13 @@ template void prepareSendBuf(const float* deviceA, float* buffer, size_t numberO
template void prepareSendBuf(const int* deviceA, int* buffer, size_t numberOfElements, const int* indices);
template <class T>
void syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
void
syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(syncFromRecvBufKernel<T>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
syncFromRecvBufKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(deviceA, buffer, numberOfElements, indices);
//cudaDeviceSynchronize(); // Not needed, I guess...
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(syncFromRecvBufKernel<T>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
syncFromRecvBufKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, buffer, numberOfElements, indices);
// cudaDeviceSynchronize(); // Not needed, I guess...
}
template void syncFromRecvBuf(double* deviceA, double* buffer, size_t numberOfElements, const int* indices);
template void syncFromRecvBuf(float* deviceA, float* buffer, size_t numberOfElements, const int* indices);
@ -217,30 +190,24 @@ weightedDiagMV(const T* squareBlockVector,
T* dstVec)
{
switch (blocksize) {
case 1:
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 1>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
weightedDiagMV<T, 1><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
}
break;
case 2:
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 2>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
weightedDiagMV<T, 2><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
}
break;
case 3:
{
int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 3>);
int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
weightedDiagMV<T, 3><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
}
break;
case 1: {
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 1>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
weightedDiagMV<T, 1>
<<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
} break;
case 2: {
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 2>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
weightedDiagMV<T, 2>
<<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
} break;
case 3: {
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 3>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
weightedDiagMV<T, 3>
<<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
} break;
default:
OPM_THROW(std::invalid_argument, "blockvector Hadamard product not implemented for blocksize>3");
break;

View File

@ -28,6 +28,7 @@
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
using NumericTypes = boost::mpl::list<double, float>;
@ -87,7 +88,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith3By3Blocks, T, Numeric
Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
Opm::cuistl::detail::invertDiagonalAndFlatten<T, 3>(
Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<T, 3>(
m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());
std::vector<T> expectedInvDiag {-1.0 / 4.0,
@ -161,7 +162,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, Numeric
Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
Opm::cuistl::detail::invertDiagonalAndFlatten<T, 2>(
Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<T, 2>(
m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());
std::vector<T> expectedInvDiag {2.0, -2.0, -1.0 / 2.0, 1.0, -1.0, 0.0, 0.0, -1.0};