opm-simulators/opm/simulators/linalg/gpuistl/detail/preconditionerKernels/ILU0Kernels.hpp
2024-09-30 16:24:49 +02:00

199 lines
11 KiB
C++

/*
Copyright 2024 SINTEF AS
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_ILU0_KERNELS_HPP
#define OPM_ILU0_KERNELS_HPP
#include <cstddef>
#include <vector>
namespace Opm::gpuistl::detail::ILU0
{
/**
* @brief Perform a upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such
* that rows in the same level sets are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format
* @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
* in the natural ordered matrix
* @param startIdx Index of the first row of the matrix to be solve
* @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this
* function
* @param [out] v Will store the results of the upper solve
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
*/
template <class T, int blocksize>
void solveUpperLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
T* v,
int threadBlockSize);
/**
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such
* that rows in the same level sets are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format
* @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
* in the natural ordered matrix
* @param startIdx Index of the first row of the matrix to be solve
* @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this
* function
* @param d Stores the defect
* @param [out] v Will store the results of the lower solve
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
*/
template <class T, int blocksize>
void solveLowerLevelSet(T* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const T* d,
T* v,
int threadBlockSize);
/**
* @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedUpperMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered
* such that rows in the same level sets are contiguous. This matrix is assumed to be strictly upper triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format
* @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
* in the natural ordered matrix
* @param startIdx Index of the first row of the matrix to be solve
* @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this
* function
* @param dInv The diagonal elements of the LU factorization. Store the inverse of the diagonal entries
* @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
* solve
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
*/
template <int blocksize, class LinearSolverScalar, class MatrixScalar>
void solveUpperLevelSetSplit(MatrixScalar* reorderedMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const MatrixScalar* dInv,
LinearSolverScalar* v,
int threadBlockSize);
/**
* @brief Perform an lower solve on certain rows in a matrix that can safely be computed in parallel
* @param reorderedLowerMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered
* such that rows in the same level sets are contiguous. This matrix is assumed to be strictly lower triangular
* @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format
* @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
* in the natural ordered matrix
* @param startIdx Index of the first row of the matrix to be solve
* @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this
* function
* @param dInv The diagonal elements of the LU factorization. Store the inverse of the diagonal entries
* @param d Stores the defect
* @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
* solve
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
*/
template <int blocksize, class LinearSolverScalar, class MatrixScalar>
void solveLowerLevelSetSplit(MatrixScalar* reorderedLowerMat,
int* rowIndices,
int* colIndices,
int* indexConversion,
int startIdx,
int rowsInLevelSet,
const LinearSolverScalar* d,
LinearSolverScalar* v,
int threadBlockSize);
/**
* @brief Computes the ILU Factorization of the input bcsr matrix, which is stored in a reordered way. The diagonal
* elements store the inverse of the diagonal entries
* @param [out] reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered
* such that rows in the same level sets are contiguous
* @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format
* @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format
* @param naturalToreordered Integer array containing mapping an index in the reordered matrix to its corresponding
* index in the natural ordered matrix
* @param reorderedToNatural Integer array containing mapping an index in the reordered matrix to its corresponding
* index in the natural ordered matrix
* @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this
* function
* @param startIdx Index of the first row of the matrix to be solve
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
*/
template <class T, int blocksize>
void LUFactorization(T* reorderedMat,
int* rowIndices,
int* columnIndices,
int* naturalToReordered,
int* reorderedToNatual,
size_t rowsInLevelSet,
int startIdx,
int threadBlockSize);
/**
* TODO: update this doucmentation
* @brief Computes the ILU0 factorization in-place of a bcsr matrix stored in a split format (lower, diagonal and upper
* triangular part)
* @param [out] reorderedLowerMat pointer to GPU memory containing nonzerovalues of the strictly lower triangular sparse
* matrix. The matrix reordered such that rows in the same level sets are contiguous
* @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant wiht bsr format
* @param lowerColIndices Pointer to vector on GPU containing col indices of the lower matrix compliant wiht bsr format
* @param [out] reorderedUpperMat pointer to GPU memory containing nonzerovalues of the strictly upper triangular sparse
* matrix. The matrix reordered such that rows in the same level sets are contiguous
* @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant wiht bsr format
* @param upperColIndices Pointer to vector on GPU containing col indices of the upper matrix compliant wiht bsr format
* @param [out] diagonal The diagonal elements of the ILU0 factorization inverted
* @param reorderedToNatural Integer array containing mapping an index in the reordered matrix to its corresponding
* index in the natural ordered matrix
* @param naturalToreordered Integer array containing mapping an index in the reordered matrix to its corresponding
* index in the natural ordered matrix
* @param startIdx Index of the first row of the matrix to be solve
* @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this
* function
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
*/
template <int blocksize, class InputScalar, class OutputScalar>
void LUFactorizationSplit(InputScalar* srcReorderedLowerMat,
int* lowerRowIndices,
int* lowerColIndices,
InputScalar* srcReorderedUpperMat,
int* upperRowIndices,
int* upperColIndices,
InputScalar* srcDiagonal,
OutputScalar* dstReorderedLowerMat,
OutputScalar* dstReorderedUpperMat,
OutputScalar* dstDiagonal,
int* reorderedToNatural,
int* naturalToReordered,
int startIdx,
int rowsInLevelSet,
bool copyResultToOtherMatrix,
int threadBlockSize);
} // namespace Opm::gpuistl::detail::ILU0
#endif