clang format

This commit is contained in:
Tobias Meyer Andersen 2024-06-18 11:42:00 +02:00
parent 2b9c81fe09
commit 82ff782d5f
3 changed files with 261 additions and 223 deletions

View File

@ -25,9 +25,9 @@
#include <opm/simulators/linalg/cuistl/CuDILU.hpp> #include <opm/simulators/linalg/cuistl/CuDILU.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp> #include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp> #include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp> #include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp> #include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <opm/simulators/linalg/matrixblock.hh> #include <opm/simulators/linalg/matrixblock.hh>
#include <vector> #include <vector>
@ -65,7 +65,9 @@ createNaturalToReordered(Opm::SparseTable<size_t> levelSets)
template <class M, class field_type, class GPUM> template <class M, class field_type, class GPUM>
void void
createReorderedMatrix(const M& naturalMatrix, std::vector<int> reorderedToNatural, std::unique_ptr<GPUM>& reorderedGpuMat) createReorderedMatrix(const M& naturalMatrix,
std::vector<int> reorderedToNatural,
std::unique_ptr<GPUM>& reorderedGpuMat)
{ {
M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise); M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) { for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) {
@ -81,22 +83,26 @@ createReorderedMatrix(const M& naturalMatrix, std::vector<int> reorderedToNatura
template <class M, class field_type, class GPUM> template <class M, class field_type, class GPUM>
void void
extractLowerAndUpperMatrices(const M& naturalMatrix, std::vector<int> reorderedToNatural, std::unique_ptr<GPUM>& lower, std::unique_ptr<GPUM>& upper) extractLowerAndUpperMatrices(const M& naturalMatrix,
std::vector<int> reorderedToNatural,
std::unique_ptr<GPUM>& lower,
std::unique_ptr<GPUM>& upper)
{ {
const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N())/2; const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N()) / 2;
M reorderedLower(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise); M reorderedLower(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
M reorderedUpper(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise); M reorderedUpper(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin(); lowerIt != reorderedLower.createend(); ++lowerIt, ++upperIt) { for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin();
lowerIt != reorderedLower.createend();
++lowerIt, ++upperIt) {
auto srcRow = naturalMatrix.begin() + reorderedToNatural[lowerIt.index()]; auto srcRow = naturalMatrix.begin() + reorderedToNatural[lowerIt.index()];
for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) { for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) {
if (elem.index() < srcRow.index()){ // add index to lower matrix if under the diagonal if (elem.index() < srcRow.index()) { // add index to lower matrix if under the diagonal
lowerIt.insert(elem.index()); lowerIt.insert(elem.index());
} } else if (elem.index() > srcRow.index()) { // add element to upper matrix if above the diagonal
else if (elem.index() > srcRow.index()){ // add element to upper matrix if above the diagonal
upperIt.insert(elem.index()); upperIt.insert(elem.index());
} }
} }
@ -144,12 +150,13 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool split_matrix)
fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. ", fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. ",
m_gpuMatrix.nonzeroes(), m_gpuMatrix.nonzeroes(),
A.nonzeroes())); A.nonzeroes()));
if (m_split_matrix){ if (m_split_matrix) {
m_gpuMatrixReorderedDiag.emplace(CuVector<field_type>(blocksize_*blocksize_*m_cpuMatrix.N())); m_gpuMatrixReorderedDiag.emplace(CuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N()));
extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper); extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(
} m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper);
else{ } else {
createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReordered); createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReordered);
} }
computeDiagAndMoveReorderedData(); computeDiagAndMoveReorderedData();
} }
@ -170,27 +177,28 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
int levelStartIdx = 0; int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) { for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size(); const int numOfRowsInLevel = m_levelSets[level].size();
if (m_split_matrix){ if (m_split_matrix) {
detail::computeLowerSolveLevelSetSplit<field_type, blocksize_>(m_gpuMatrixReorderedLower->getNonZeroValues().data(), detail::computeLowerSolveLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(), m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuReorderToNatural.data(), m_gpuMatrixReorderedLower->getColumnIndices().data(),
levelStartIdx, m_gpuReorderToNatural.data(),
numOfRowsInLevel, levelStartIdx,
m_gpuDInv.data(), numOfRowsInLevel,
d.data(), m_gpuDInv.data(),
v.data()); d.data(),
} v.data());
else{ } else {
detail::computeLowerSolveLevelSet<field_type, blocksize_>(m_gpuMatrixReordered->getNonZeroValues().data(), detail::computeLowerSolveLevelSet<field_type, blocksize_>(
m_gpuMatrixReordered->getRowIndices().data(), m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getColumnIndices().data(), m_gpuMatrixReordered->getRowIndices().data(),
m_gpuReorderToNatural.data(), m_gpuMatrixReordered->getColumnIndices().data(),
levelStartIdx, m_gpuReorderToNatural.data(),
numOfRowsInLevel, levelStartIdx,
m_gpuDInv.data(), numOfRowsInLevel,
d.data(), m_gpuDInv.data(),
v.data()); d.data(),
v.data());
} }
levelStartIdx += numOfRowsInLevel; levelStartIdx += numOfRowsInLevel;
} }
@ -200,25 +208,26 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
for (int level = m_levelSets.size() - 1; level >= 0; --level) { for (int level = m_levelSets.size() - 1; level >= 0; --level) {
const int numOfRowsInLevel = m_levelSets[level].size(); const int numOfRowsInLevel = m_levelSets[level].size();
levelStartIdx -= numOfRowsInLevel; levelStartIdx -= numOfRowsInLevel;
if (m_split_matrix){ if (m_split_matrix) {
detail::computeUpperSolveLevelSetSplit<field_type, blocksize_>(m_gpuMatrixReorderedUpper->getNonZeroValues().data(), detail::computeUpperSolveLevelSetSplit<field_type, blocksize_>(
m_gpuMatrixReorderedUpper->getRowIndices().data(), m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(), m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuReorderToNatural.data(), m_gpuMatrixReorderedUpper->getColumnIndices().data(),
levelStartIdx, m_gpuReorderToNatural.data(),
numOfRowsInLevel, levelStartIdx,
m_gpuDInv.data(), numOfRowsInLevel,
v.data()); m_gpuDInv.data(),
} v.data());
else{ } else {
detail::computeUpperSolveLevelSet<field_type, blocksize_>(m_gpuMatrixReordered->getNonZeroValues().data(), detail::computeUpperSolveLevelSet<field_type, blocksize_>(
m_gpuMatrixReordered->getRowIndices().data(), m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getColumnIndices().data(), m_gpuMatrixReordered->getRowIndices().data(),
m_gpuReorderToNatural.data(), m_gpuMatrixReordered->getColumnIndices().data(),
levelStartIdx, m_gpuReorderToNatural.data(),
numOfRowsInLevel, levelStartIdx,
m_gpuDInv.data(), numOfRowsInLevel,
v.data()); m_gpuDInv.data(),
v.data());
} }
} }
} }
@ -254,45 +263,45 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
{ {
OPM_TIMEBLOCK(prec_update); OPM_TIMEBLOCK(prec_update);
{ {
if (m_split_matrix){ if (m_split_matrix) {
detail::copyMatDataToReorderedSplit<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(), detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
m_gpuMatrix.getRowIndices().data(), m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getColumnIndices().data(), m_gpuMatrix.getRowIndices().data(),
m_gpuMatrixReorderedLower->getNonZeroValues().data(), m_gpuMatrix.getColumnIndices().data(),
m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(), m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedDiag.value().data(), m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuNaturalToReorder.data(), m_gpuMatrixReorderedDiag.value().data(),
m_gpuMatrixReorderedLower->N()); m_gpuNaturalToReorder.data(),
} m_gpuMatrixReorderedLower->N());
else{ } else {
detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(), detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
m_gpuMatrix.getRowIndices().data(), m_gpuMatrix.getRowIndices().data(),
m_gpuMatrixReordered->getNonZeroValues().data(), m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(), m_gpuMatrixReordered->getRowIndices().data(),
m_gpuNaturalToReorder.data(), m_gpuNaturalToReorder.data(),
m_gpuMatrixReordered->N()); m_gpuMatrixReordered->N());
} }
int levelStartIdx = 0; int levelStartIdx = 0;
for (int level = 0; level < m_levelSets.size(); ++level) { for (int level = 0; level < m_levelSets.size(); ++level) {
const int numOfRowsInLevel = m_levelSets[level].size(); const int numOfRowsInLevel = m_levelSets[level].size();
if (m_split_matrix){ if (m_split_matrix) {
detail::computeDiluDiagonalSplit<field_type, blocksize_>(m_gpuMatrixReorderedLower->getNonZeroValues().data(), detail::computeDiluDiagonalSplit<field_type, blocksize_>(
m_gpuMatrixReorderedLower->getRowIndices().data(), m_gpuMatrixReorderedLower->getNonZeroValues().data(),
m_gpuMatrixReorderedLower->getColumnIndices().data(), m_gpuMatrixReorderedLower->getRowIndices().data(),
m_gpuMatrixReorderedUpper->getNonZeroValues().data(), m_gpuMatrixReorderedLower->getColumnIndices().data(),
m_gpuMatrixReorderedUpper->getRowIndices().data(), m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
m_gpuMatrixReorderedUpper->getColumnIndices().data(), m_gpuMatrixReorderedUpper->getRowIndices().data(),
m_gpuMatrixReorderedDiag.value().data(), m_gpuMatrixReorderedUpper->getColumnIndices().data(),
m_gpuReorderToNatural.data(), m_gpuMatrixReorderedDiag.value().data(),
m_gpuNaturalToReorder.data(), m_gpuReorderToNatural.data(),
levelStartIdx, m_gpuNaturalToReorder.data(),
numOfRowsInLevel, levelStartIdx,
m_gpuDInv.data()); numOfRowsInLevel,
} m_gpuDInv.data());
else{ } else {
detail::computeDiluDiagonal<field_type, blocksize_>(m_gpuMatrixReordered->getNonZeroValues().data(), detail::computeDiluDiagonal<field_type, blocksize_>(m_gpuMatrixReordered->getNonZeroValues().data(),
m_gpuMatrixReordered->getRowIndices().data(), m_gpuMatrixReordered->getRowIndices().data(),
m_gpuMatrixReordered->getColumnIndices().data(), m_gpuMatrixReordered->getColumnIndices().data(),

View File

@ -229,14 +229,14 @@ namespace
template <class T, int blocksize> template <class T, int blocksize>
__global__ void cuComputeLowerSolveLevelSetSplit(T* mat, __global__ void cuComputeLowerSolveLevelSetSplit(T* mat,
int* rowIndices, int* rowIndices,
int* colIndices, int* colIndices,
int* indexConversion, int* indexConversion,
int startIdx, int startIdx,
int rowsInLevelSet, int rowsInLevelSet,
const T* dInv, const T* dInv,
const T* d, const T* d,
T* v) T* v)
{ {
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) { if (reorderedRowIdx < rowsInLevelSet + startIdx) {
@ -250,7 +250,7 @@ namespace
rhs[i] = d[naturalRowIdx * blocksize + i]; rhs[i] = d[naturalRowIdx * blocksize + i];
} }
//TODO: removce the first condition in the for loop // TODO: removce the first condition in the for loop
for (int block = nnzIdx; block < nnzIdxLim; ++block) { for (int block = nnzIdx; block < nnzIdxLim; ++block) {
const int col = colIndices[block]; const int col = colIndices[block];
mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs); mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
@ -288,13 +288,13 @@ namespace
template <class T, int blocksize> template <class T, int blocksize>
__global__ void cuComputeUpperSolveLevelSetSplit(T* mat, __global__ void cuComputeUpperSolveLevelSetSplit(T* mat,
int* rowIndices, int* rowIndices,
int* colIndices, int* colIndices,
int* indexConversion, int* indexConversion,
int startIdx, int startIdx,
int rowsInLevelSet, int rowsInLevelSet,
const T* dInv, const T* dInv,
T* v) T* v)
{ {
const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x); const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
if (reorderedRowIdx < rowsInLevelSet + startIdx) { if (reorderedRowIdx < rowsInLevelSet + startIdx) {
@ -381,23 +381,23 @@ namespace
template <class T, int blocksize> template <class T, int blocksize>
__global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat, __global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices, int* lowerRowIndices,
int* lowerColIndices, int* lowerColIndices,
T* reorderedUpperMat, T* reorderedUpperMat,
int* upperRowIndices, int* upperRowIndices,
int* upperColIndices, int* upperColIndices,
T* diagonal, T* diagonal,
int* reorderedToNatural, int* reorderedToNatural,
int* naturalToReordered, int* naturalToReordered,
const int startIdx, const int startIdx,
int rowsInLevelSet, int rowsInLevelSet,
T* dInv) T* dInv)
{ {
const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x; const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
if (reorderedRowIdx < rowsInLevelSet + startIdx) { if (reorderedRowIdx < rowsInLevelSet + startIdx) {
const int naturalRowIdx = reorderedToNatural[reorderedRowIdx]; const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx]; const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx];
const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx+1]; const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1];
T dInvTmp[blocksize * blocksize]; T dInvTmp[blocksize * blocksize];
for (int i = 0; i < blocksize; ++i) { for (int i = 0; i < blocksize; ++i) {
@ -410,8 +410,8 @@ namespace
const int col = naturalToReordered[lowerColIndices[block]]; const int col = naturalToReordered[lowerColIndices[block]];
int symOppositeIdx = upperRowIndices[col]; int symOppositeIdx = upperRowIndices[col];
for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx){ for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx) {
if (naturalRowIdx == upperColIndices[symOppositeIdx]){ if (naturalRowIdx == upperColIndices[symOppositeIdx]) {
break; break;
} }
} }
@ -457,15 +457,23 @@ namespace
} }
template <class T, int blocksize> template <class T, int blocksize>
__global__ void cuMoveDataToReorderedSplit( __global__ void cuMoveDataToReorderedSplit(T* srcMatrix,
T* srcMatrix, int* srcRowIndices, int* srcColumnIndices, T* dstLowerMatrix, int* dstLowerRowIndices, T* dstUpperMatrix, int* dstUpperRowIndices, T* dstDiag, int* naturalToReordered, size_t numberOfRows) int* srcRowIndices,
int* srcColumnIndices,
T* dstLowerMatrix,
int* dstLowerRowIndices,
T* dstUpperMatrix,
int* dstUpperRowIndices,
T* dstDiag,
int* naturalToReordered,
size_t numberOfRows)
{ {
const auto srcRow = blockDim.x * blockIdx.x + threadIdx.x; const auto srcRow = blockDim.x * blockIdx.x + threadIdx.x;
if (srcRow < numberOfRows) { if (srcRow < numberOfRows) {
const auto dstRow = naturalToReordered[srcRow]; const auto dstRow = naturalToReordered[srcRow];
const auto rowStart = srcRowIndices[srcRow]; const auto rowStart = srcRowIndices[srcRow];
const auto rowEnd = srcRowIndices[srcRow+1]; const auto rowEnd = srcRowIndices[srcRow + 1];
auto lowerBlock = dstLowerRowIndices[dstRow]; auto lowerBlock = dstLowerRowIndices[dstRow];
auto upperBlock = dstUpperRowIndices[dstRow]; auto upperBlock = dstUpperRowIndices[dstRow];
@ -474,17 +482,16 @@ namespace
int dstBlock; int dstBlock;
T* dstBuffer; T* dstBuffer;
if (srcColumnIndices[srcBlock] < srcRow){ // we are writing a value to the lower triangular matrix if (srcColumnIndices[srcBlock] < srcRow) { // we are writing a value to the lower triangular matrix
dstBlock = lowerBlock; dstBlock = lowerBlock;
++lowerBlock; ++lowerBlock;
dstBuffer = dstLowerMatrix; dstBuffer = dstLowerMatrix;
} } else if (srcColumnIndices[srcBlock]
else if (srcColumnIndices[srcBlock] > srcRow){ // we are writing a value to the upper triangular matrix > srcRow) { // we are writing a value to the upper triangular matrix
dstBlock = upperBlock; dstBlock = upperBlock;
++upperBlock; ++upperBlock;
dstBuffer = dstUpperMatrix; dstBuffer = dstUpperMatrix;
} } else { // we are writing a value to the diagonal
else{ // we are writing a value to the diagonal
dstBlock = dstRow; dstBlock = dstRow;
dstBuffer = dstDiag; dstBuffer = dstDiag;
} }
@ -511,14 +518,16 @@ namespace
// Kernel here is the function object of the cuda kernel // Kernel here is the function object of the cuda kernel
template <class Kernel> template <class Kernel>
inline int getCudaRecomendedThreadBlockSize(Kernel k){ inline int getCudaRecomendedThreadBlockSize(Kernel k)
{
int blockSize; int blockSize;
int tmpGridSize; int tmpGridSize;
cudaOccupancyMaxPotentialBlockSize( &tmpGridSize, &blockSize, k, 0, 0); cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0);
return blockSize; return blockSize;
} }
inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize){ inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize)
{
return (wantedThreads + threadBlockSize - 1) / threadBlockSize; return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
} }
@ -557,14 +566,14 @@ computeLowerSolveLevelSet(T* reorderedMat,
template <class T, int blocksize> template <class T, int blocksize>
void void
computeLowerSolveLevelSetSplit(T* reorderedMat, computeLowerSolveLevelSetSplit(T* reorderedMat,
int* rowIndices, int* rowIndices,
int* colIndices, int* colIndices,
int* indexConversion, int* indexConversion,
int startIdx, int startIdx,
int rowsInLevelSet, int rowsInLevelSet,
const T* dInv, const T* dInv,
const T* d, const T* d,
T* v) T* v)
{ {
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>); int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize); int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
@ -590,13 +599,13 @@ computeUpperSolveLevelSet(T* reorderedMat,
template <class T, int blocksize> template <class T, int blocksize>
void void
computeUpperSolveLevelSetSplit(T* reorderedMat, computeUpperSolveLevelSetSplit(T* reorderedMat,
int* rowIndices, int* rowIndices,
int* colIndices, int* colIndices,
int* indexConversion, int* indexConversion,
int startIdx, int startIdx,
int rowsInLevelSet, int rowsInLevelSet,
const T* dInv, const T* dInv,
T* v) T* v)
{ {
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>); int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize); int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
@ -633,34 +642,33 @@ computeDiluDiagonal(T* reorderedMat,
template <class T, int blocksize> template <class T, int blocksize>
void void
computeDiluDiagonalSplit(T* reorderedLowerMat, computeDiluDiagonalSplit(T* reorderedLowerMat,
int* lowerRowIndices, int* lowerRowIndices,
int* lowerColIndices, int* lowerColIndices,
T* reorderedUpperMat, T* reorderedUpperMat,
int* upperRowIndices, int* upperRowIndices,
int* upperColIndices, int* upperColIndices,
T* diagonal, T* diagonal,
int* reorderedToNatural, int* reorderedToNatural,
int* naturalToReordered, int* naturalToReordered,
const int startIdx, const int startIdx,
int rowsInLevelSet, int rowsInLevelSet,
T* dInv) T* dInv)
{ {
if (blocksize <= 3) { if (blocksize <= 3) {
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>); int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize); int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
cuComputeDiluDiagonalSplit<T, blocksize> cuComputeDiluDiagonalSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
<<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat, lowerRowIndices,
lowerRowIndices, lowerColIndices,
lowerColIndices, reorderedUpperMat,
reorderedUpperMat, upperRowIndices,
upperRowIndices, upperColIndices,
upperColIndices, diagonal,
diagonal, reorderedToNatural,
reorderedToNatural, naturalToReordered,
naturalToReordered, startIdx,
startIdx, rowsInLevelSet,
rowsInLevelSet, dInv);
dInv);
} else { } else {
OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3"); OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
} }
@ -677,24 +685,41 @@ copyMatDataToReordered(
template <class T, int blocksize> template <class T, int blocksize>
void void
copyMatDataToReorderedSplit( copyMatDataToReorderedSplit(T* srcMatrix,
T* srcMatrix, int* srcRowIndices, int* srcColumnIndices, T* dstLowerMatrix, int* dstLowerRowIndices, T* dstUpperMatrix, int* dstUpperRowIndices, T* dstDiag, int* naturalToReordered, size_t numberOfRows) int* srcRowIndices,
int* srcColumnIndices,
T* dstLowerMatrix,
int* dstLowerRowIndices,
T* dstUpperMatrix,
int* dstUpperRowIndices,
T* dstDiag,
int* naturalToReordered,
size_t numberOfRows)
{ {
int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>); int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>);
int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize); int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
cuMoveDataToReorderedSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>( cuMoveDataToReorderedSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(srcMatrix,
srcMatrix, srcRowIndices, srcColumnIndices, dstLowerMatrix, dstLowerRowIndices, dstUpperMatrix, dstUpperRowIndices, dstDiag, naturalToReordered, numberOfRows); srcRowIndices,
srcColumnIndices,
dstLowerMatrix,
dstLowerRowIndices,
dstUpperMatrix,
dstUpperRowIndices,
dstDiag,
naturalToReordered,
numberOfRows);
} }
#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \ #define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*); \ template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*); \
template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t); \ template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t); \
template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t); \ template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t); \
template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*); \ template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*); \
template void computeDiluDiagonalSplit<T, blocksize>(T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*);\ template void computeDiluDiagonalSplit<T, blocksize>( \
template void computeUpperSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*); \ T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*); \
template void computeLowerSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*); \ template void computeUpperSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*); \
template void computeUpperSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*); \ template void computeLowerSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*); \
template void computeUpperSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*); \
template void computeLowerSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*); template void computeLowerSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*);
INSTANTIATE_KERNEL_WRAPPERS(float, 1); INSTANTIATE_KERNEL_WRAPPERS(float, 1);

View File

@ -24,12 +24,12 @@
#include <dune/common/fmatrix.hh> #include <dune/common/fmatrix.hh>
#include <dune/istl/bcrsmatrix.hh> #include <dune/istl/bcrsmatrix.hh>
#include <memory> #include <memory>
#include <opm/simulators/linalg/DILU.hpp>
#include <opm/simulators/linalg/cuistl/CuDILU.hpp> #include <opm/simulators/linalg/cuistl/CuDILU.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp> #include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp> #include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp> #include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp> #include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/DILU.hpp>
#include <random> #include <random>
#include <vector> #include <vector>
@ -44,47 +44,49 @@ using Sp2x2BlockMatrix = Dune::BCRSMatrix<FM2x2>;
using CuMatrix = Opm::cuistl::CuSparseMatrix<T>; using CuMatrix = Opm::cuistl::CuSparseMatrix<T>;
using CuIntVec = Opm::cuistl::CuVector<int>; using CuIntVec = Opm::cuistl::CuVector<int>;
using CuFloatingPointVec = Opm::cuistl::CuVector<T>; using CuFloatingPointVec = Opm::cuistl::CuVector<T>;
using CuDilu1x1 = Opm::cuistl::CuDILU<Sp1x1BlockMatrix, CuFloatingPointVec, CuFloatingPointVec>; using CuDilu1x1 = Opm::cuistl::CuDILU<Sp1x1BlockMatrix, CuFloatingPointVec, CuFloatingPointVec>;
using CuDilu2x2 = Opm::cuistl::CuDILU<Sp2x2BlockMatrix, CuFloatingPointVec, CuFloatingPointVec>; using CuDilu2x2 = Opm::cuistl::CuDILU<Sp2x2BlockMatrix, CuFloatingPointVec, CuFloatingPointVec>;
Sp1x1BlockMatrix get1x1BlockTestMatrix(){ Sp1x1BlockMatrix
/* get1x1BlockTestMatrix()
matA: {
1 2 0 3 0 0 /*
4 5 0 6 0 7 matA:
0 0 8 0 0 0 1 2 0 3 0 0
9 10 0 11 12 0 4 5 0 6 0 7
0 0 0 13 14 0 0 0 8 0 0 0
0 15 0 0 0 16 9 10 0 11 12 0
0 0 0 13 14 0
0 15 0 0 0 16
Expected reordering: Expected reordering:
1 2 0 3 0 0 1 2 0 3 0 0
0 0 8 0 0 0 0 0 8 0 0 0
4 5 0 6 0 7 4 5 0 6 0 7
9 10 0 11 12 0 9 10 0 11 12 0
0 15 0 0 0 16 0 15 0 0 0 16
0 0 0 13 14 0 0 0 0 13 14 0
Expected lowerTriangularReorderedMatrix: Expected lowerTriangularReorderedMatrix:
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 4 0 0 0 0 0
9 10 0 0 0 0 9 10 0 0 0 0
0 15 0 0 0 0 0 15 0 0 0 0
0 0 0 13 0 0 0 0 0 13 0 0
Expected lowerTriangularReorderedMatrix: Expected lowerTriangularReorderedMatrix:
0 2 0 3 0 0 0 2 0 3 0 0
0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 6 0 7 0 0 0 6 0 7
0 0 0 0 12 0 0 0 0 0 12 0
0 0 0 0 0 0 0 0 0 0 0 0
*/ */
const int N = 6; const int N = 6;
const int nonZeroes = 16; const int nonZeroes = 16;
//Create the Dune A matrix // Create the Dune A matrix
Sp1x1BlockMatrix matA(N, N, nonZeroes, Sp1x1BlockMatrix::row_wise); Sp1x1BlockMatrix matA(N, N, nonZeroes, Sp1x1BlockMatrix::row_wise);
for (auto row = matA.createbegin(); row != matA.createend(); ++row) { for (auto row = matA.createbegin(); row != matA.createend(); ++row) {
row.insert(row.index()); row.insert(row.index());
@ -132,7 +134,9 @@ Sp1x1BlockMatrix get1x1BlockTestMatrix(){
return matA; return matA;
} }
Sp2x2BlockMatrix get2x2BlockTestMatrix(){ Sp2x2BlockMatrix
get2x2BlockTestMatrix()
{
/* /*
matA: matA:
1 2 0 3 0 0 1 2 0 3 0 0
@ -148,7 +152,7 @@ Sp2x2BlockMatrix get2x2BlockTestMatrix(){
const int N = 3; const int N = 3;
const int nonZeroes = 9; const int nonZeroes = 9;
//Create the Dune A matrix // Create the Dune A matrix
Sp2x2BlockMatrix matA(N, N, nonZeroes, Sp2x2BlockMatrix::row_wise); Sp2x2BlockMatrix matA(N, N, nonZeroes, Sp2x2BlockMatrix::row_wise);
for (auto row = matA.createbegin(); row != matA.createend(); ++row) { for (auto row = matA.createbegin(); row != matA.createend(); ++row) {
row.insert(row.index()); row.insert(row.index());
@ -215,13 +219,13 @@ BOOST_AUTO_TEST_CASE(TestDiluApply)
// put results in std::vector // put results in std::vector
std::vector<T> cpudilures; std::vector<T> cpudilures;
for (auto e : h_output){ for (auto e : h_output) {
cpudilures.push_back(e); cpudilures.push_back(e);
} }
auto cudilures = d_output.asStdVector(); auto cudilures = d_output.asStdVector();
// check that CuDilu results matches that of CPU dilu // check that CuDilu results matches that of CPU dilu
for (size_t i = 0; i < cudilures.size(); ++i){ for (size_t i = 0; i < cudilures.size(); ++i) {
BOOST_CHECK_CLOSE(cudilures[i], cpudilures[i], 1e-7); BOOST_CHECK_CLOSE(cudilures[i], cpudilures[i], 1e-7);
} }
} }
@ -255,14 +259,14 @@ BOOST_AUTO_TEST_CASE(TestDiluApplyBlocked)
auto cudilures = d_output.asStdVector(); auto cudilures = d_output.asStdVector();
std::vector<T> cpudilures; std::vector<T> cpudilures;
for (auto v : h_output){ for (auto v : h_output) {
for (auto e : v){ for (auto e : v) {
cpudilures.push_back(e); cpudilures.push_back(e);
} }
} }
// check that the values are close // check that the values are close
for (size_t i = 0; i < cudilures.size(); ++i){ for (size_t i = 0; i < cudilures.size(); ++i) {
BOOST_CHECK_CLOSE(cudilures[i], cpudilures[i], 1e-7); BOOST_CHECK_CLOSE(cudilures[i], cpudilures[i], 1e-7);
} }
} }
@ -316,13 +320,13 @@ BOOST_AUTO_TEST_CASE(TestDiluInitAndUpdateLarge)
// put results in std::vector // put results in std::vector
std::vector<T> cpudilures; std::vector<T> cpudilures;
for (auto e : h_output){ for (auto e : h_output) {
cpudilures.push_back(e); cpudilures.push_back(e);
} }
auto cudilures = d_output.asStdVector(); auto cudilures = d_output.asStdVector();
// check that CuDilu results matches that of CPU dilu // check that CuDilu results matches that of CPU dilu
for (size_t i = 0; i < cudilures.size(); ++i){ for (size_t i = 0; i < cudilures.size(); ++i) {
BOOST_CHECK_CLOSE(cudilures[i], cpudilures[i], 1e-7); BOOST_CHECK_CLOSE(cudilures[i], cpudilures[i], 1e-7);
} }
} }