mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Fuse kernels to optimize performance and update
the tests and documentation.
This commit is contained in:
@@ -83,24 +83,18 @@ CuJac<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
|
||||
template <class M, class X, class Y, int l>
|
||||
void
|
||||
CuJac<M, X, Y, l>::apply(X& v, const Y& d)
|
||||
{
|
||||
{
|
||||
// Jacobi preconditioner: x_{n+1} = x_n + w * (D^-1 * (b - Ax_n) )
|
||||
// Working with defect d and update v it we only need to set v = w*(D^-1)*d
|
||||
|
||||
const auto numberOfRows = detail::to_int(cuMatrix.N());
|
||||
const auto numberOfNonzeroBlocks = detail::to_int(cuMatrix.nonzeroes());
|
||||
const auto blockSize = detail::to_int(cuMatrix.blockSize());
|
||||
|
||||
// Could probably be further optimized a little bit by fusing the copy with the
|
||||
// following multiplication. To do so add a kernel that does not do the multiplication
|
||||
// in place.
|
||||
|
||||
v = d; // cuda copy
|
||||
detail::blockVectorMultiplicationAtAllIndices(cuVec_diagInvFlattened.data(),
|
||||
detail::to_size_t(numberOfRows),
|
||||
detail::to_size_t(blockSize),
|
||||
v.data());
|
||||
v *= relaxation_factor; // cuBlas axpy
|
||||
// Compute the MV product where the matrix is diagonal and therefore stored as a vector.
|
||||
// The product is thus computed as a hadamard product.
|
||||
detail::weightedDiagMV(cuVec_diagInvFlattened.data(),
|
||||
detail::to_size_t(cuMatrix.N()),
|
||||
detail::to_size_t(cuMatrix.blockSize()),
|
||||
relaxation_factor,
|
||||
d.data(),
|
||||
v.data());
|
||||
}
|
||||
|
||||
template <class M, class X, class Y, int l>
|
||||
|
||||
@@ -30,10 +30,7 @@
|
||||
|
||||
namespace Opm::cuistl
|
||||
{
|
||||
//! \brief Sequential Jacobi preconditioner on the GPU through the CuSparse library.
|
||||
//!
|
||||
//! This implementation calls the CuSparse functions, which in turn essentially
|
||||
//! does a level decomposition to get some parallelism.
|
||||
//! \brief Sequential Jacobi preconditioner on the GPU.
|
||||
//!
|
||||
//! \note This is not expected to be a fast preconditioner.
|
||||
//!
|
||||
@@ -50,7 +47,7 @@ class CuJac : public Dune::PreconditionerWithUpdate<X, Y>
|
||||
{
|
||||
public:
|
||||
//! \brief The matrix type the preconditioner is for.
|
||||
using matrix_type = typename std::remove_const<M>::type ;
|
||||
using matrix_type = typename std::remove_const<M>::type;
|
||||
//! \brief The domain type of the preconditioner.
|
||||
using domain_type = X;
|
||||
//! \brief The range type of the preconditioner.
|
||||
@@ -101,7 +98,7 @@ private:
|
||||
//! \brief Reference to the underlying matrix
|
||||
const M& cpuMatrix;
|
||||
//! \brief The relaxation factor to use.
|
||||
field_type relaxation_factor;
|
||||
const field_type relaxation_factor;
|
||||
//! \brief The A matrix stored on the gpu
|
||||
CuSparseMatrix<field_type> cuMatrix;
|
||||
//! \brief the diagonal of cuMatrix inverted, and then flattened to fit in a vector
|
||||
|
||||
@@ -48,50 +48,56 @@ namespace
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void elementWiseMultiplyMVKernelBlocksize1(T* squareBlockVector, const size_t numberOfElements, T* vec)
|
||||
__global__ void weightedDiagMVBlocksize1(
|
||||
T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
|
||||
{
|
||||
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int blocksize = 1;
|
||||
|
||||
if (globalIndex < numberOfElements) {
|
||||
T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
|
||||
T* pVec = (vec + (blocksize * globalIndex));
|
||||
const T* srcVec = (src_vec + (blocksize * globalIndex));
|
||||
T* dstVec = (dst_vec + (blocksize * globalIndex));
|
||||
|
||||
T v0 = pVec[0];
|
||||
pVec[0] = pMat[0] * v0;
|
||||
T v0 = srcVec[0];
|
||||
dstVec[0] = (pMat[0] * v0) * w;
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void elementWiseMultiplyMVKernelBlocksize2(T* squareBlockVector, const size_t numberOfElements, T* vec)
|
||||
__global__ void weightedDiagMVBlocksize2(
|
||||
T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
|
||||
{
|
||||
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int blocksize = 2;
|
||||
|
||||
if (globalIndex < numberOfElements) {
|
||||
T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
|
||||
T* pVec = (vec + (blocksize * globalIndex));
|
||||
const T* srcVec = (src_vec + (blocksize * globalIndex));
|
||||
T* dstVec = (dst_vec + (blocksize * globalIndex));
|
||||
|
||||
T v0 = pVec[0], v1 = pVec[1];
|
||||
pVec[0] = pMat[0] * v0 + pMat[1] * v1;
|
||||
pVec[1] = pMat[2] * v0 + pMat[3] * v1;
|
||||
T v0 = srcVec[0], v1 = srcVec[1];
|
||||
dstVec[0] = (pMat[0] * v0 + pMat[1] * v1) * w;
|
||||
dstVec[1] = (pMat[2] * v0 + pMat[3] * v1) * w;
|
||||
}
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__global__ void elementWiseMultiplyMVKernelBlocksize3(T* squareBlockVector, const size_t numberOfElements, T* vec)
|
||||
__global__ void weightedDiagMVBlocksize3(
|
||||
T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
|
||||
{
|
||||
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int blocksize = 3;
|
||||
|
||||
if (globalIndex < numberOfElements) {
|
||||
T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
|
||||
T* pVec = (vec + (blocksize * globalIndex));
|
||||
const T* srcVec = (src_vec + (blocksize * globalIndex));
|
||||
T* dstVec = (dst_vec + (blocksize * globalIndex));
|
||||
|
||||
T v0 = pVec[0], v1 = pVec[1], v2 = pVec[2];
|
||||
pVec[0] = pMat[0] * v0 + pMat[1] * v1 + pMat[2] * v2;
|
||||
pVec[1] = pMat[3] * v0 + pMat[4] * v1 + pMat[5] * v2;
|
||||
pVec[2] = pMat[6] * v0 + pMat[7] * v1 + pMat[8] * v2;
|
||||
T v0 = srcVec[0], v1 = srcVec[1], v2 = srcVec[2];
|
||||
dstVec[0] = (pMat[0] * v0 + pMat[1] * v1 + pMat[2] * v2) * w;
|
||||
dstVec[1] = (pMat[3] * v0 + pMat[4] * v1 + pMat[5] * v2) * w;
|
||||
dstVec[2] = (pMat[6] * v0 + pMat[7] * v1 + pMat[8] * v2) * w;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -162,23 +168,25 @@ template int innerProductAtIndices(const int*, const int*, int* buffer, size_t,
|
||||
|
||||
template <class T>
|
||||
void
|
||||
blockVectorMultiplicationAtAllIndices(T* squareBlockVector,
|
||||
const size_t numberOfElements,
|
||||
const size_t blocksize,
|
||||
T* vec)
|
||||
weightedDiagMV(T* squareBlockVector,
|
||||
const size_t numberOfElements,
|
||||
const size_t blocksize,
|
||||
T relaxation_factor,
|
||||
const T* src_vec,
|
||||
T* dst_vec)
|
||||
{
|
||||
switch (blocksize) {
|
||||
case 1:
|
||||
elementWiseMultiplyMVKernelBlocksize1<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
|
||||
squareBlockVector, numberOfElements, vec);
|
||||
weightedDiagMVBlocksize1<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
|
||||
squareBlockVector, numberOfElements, relaxation_factor, src_vec, dst_vec);
|
||||
break;
|
||||
case 2:
|
||||
elementWiseMultiplyMVKernelBlocksize2<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
|
||||
squareBlockVector, numberOfElements, vec);
|
||||
weightedDiagMVBlocksize2<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
|
||||
squareBlockVector, numberOfElements, relaxation_factor, src_vec, dst_vec);
|
||||
break;
|
||||
case 3:
|
||||
elementWiseMultiplyMVKernelBlocksize3<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
|
||||
squareBlockVector, numberOfElements, vec);
|
||||
weightedDiagMVBlocksize3<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
|
||||
squareBlockVector, numberOfElements, relaxation_factor, src_vec, dst_vec);
|
||||
break;
|
||||
default:
|
||||
OPM_THROW(std::invalid_argument, "blockvector hadamard product not defined for blocksize>3");
|
||||
@@ -186,7 +194,7 @@ blockVectorMultiplicationAtAllIndices(T* squareBlockVector,
|
||||
}
|
||||
}
|
||||
|
||||
template void blockVectorMultiplicationAtAllIndices(double*, const size_t, const size_t, double*);
|
||||
template void blockVectorMultiplicationAtAllIndices(float*, const size_t, const size_t, float*);
|
||||
template void weightedDiagMV(double*, const size_t, const size_t, double, const double*, double*);
|
||||
template void weightedDiagMV(float*, const size_t, const size_t, float, const float*, float*);
|
||||
|
||||
} // namespace Opm::cuistl::detail
|
||||
|
||||
@@ -56,15 +56,23 @@ template <class T>
|
||||
T innerProductAtIndices(const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices);
|
||||
|
||||
/**
|
||||
* @brief Compute the hadamard product of two vectors element by element, where the first vector containts square blocks, while the second vector containts vectors
|
||||
* @brief Compue the weighted matrix vector product where the matrix is diagonal, the diagonal is a vector, meaning we
|
||||
* compute the hadamard product.
|
||||
* @param squareBlockVector A CuVector whose elements are NxN matrix blocks
|
||||
* @param numberOfRows The number of rows in the vector
|
||||
* @param blocksize The sidelength of the square block elements in the vector
|
||||
* @param vec A pointer to the data of the CuVector we multiply the blockvector with, the result is stored in vec
|
||||
*
|
||||
* @note This is implemented as a faster way to multiply a diagonal matrix with a blockvector. We need only store the diagonal of the matrix and use this product.
|
||||
* @param src_vec A pointer to the data of the CuVector we multiply the blockvector with
|
||||
* @param dst_vec A pointer to the data of the CuVector we store the result in
|
||||
*
|
||||
* @note This is implemented as a faster way to multiply a diagonal matrix with a blockvector. We need only store the
|
||||
* diagonal of the matrix and use this product.
|
||||
*/
|
||||
template <class T>
|
||||
void blockVectorMultiplicationAtAllIndices(T* squareBlockVector, const size_t numberOfRows, const size_t blocksize, T* vec);
|
||||
void weightedDiagMV(T* squareBlockVector,
|
||||
const size_t numberOfRows,
|
||||
const size_t blocksize,
|
||||
T relaxation_factor,
|
||||
const T* src_vec,
|
||||
T* dst_vec);
|
||||
} // namespace Opm::cuistl::detail
|
||||
#endif
|
||||
|
||||
@@ -42,16 +42,19 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf3By3BlockVectorAndVecto
|
||||
|
||||
const size_t blocksize = 3;
|
||||
const size_t N = 1;
|
||||
const T weight = 1.0;
|
||||
|
||||
std::vector<T> h_blockVector({1.0,2.0,3.0,5.0,2.0,3.0,2.0,1.0,2.0});
|
||||
std::vector<T> h_vecVector({3.0,2.0,1.0});
|
||||
std::vector<T> h_dstVector({0, 0, 0});
|
||||
Opm::cuistl::CuVector<T> d_blockVector(h_blockVector);
|
||||
Opm::cuistl::CuVector<T> d_vecVector(h_vecVector);
|
||||
Opm::cuistl::CuVector<T> d_dstVector(h_dstVector);
|
||||
|
||||
Opm::cuistl::detail::blockVectorMultiplicationAtAllIndices(d_blockVector.data(), N, blocksize, d_vecVector.data());
|
||||
Opm::cuistl::detail::weightedDiagMV(d_blockVector.data(), N, blocksize, weight, d_vecVector.data(), d_dstVector.data());
|
||||
|
||||
std::vector<T> expected_vec{10.0,22.0,10.0};
|
||||
std::vector<T> computed_vec = d_vecVector.asStdVector();
|
||||
std::vector<T> computed_vec = d_dstVector.asStdVector();
|
||||
|
||||
BOOST_REQUIRE_EQUAL(expected_vec.size(), computed_vec.size());
|
||||
for (size_t i = 0; i < expected_vec.size(); i++){
|
||||
@@ -63,25 +66,28 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf2By2BlockVectorAndVecto
|
||||
{
|
||||
/*
|
||||
Example in the test for multiplying by element a blockvector with a vector of vectors
|
||||
| |1 2| | | |1| | | | 7| |
|
||||
| |3 4| | X | |3| | = | |15| |
|
||||
| | | | | |
|
||||
| |4 3| | | |2| | | |20| |
|
||||
| |2 1| | | |4| | | | 8| |
|
||||
| |1 2| | | |1| | | | 3.5| |
|
||||
0.5 * | |3 4| | X | |3| | = | | 7.5| |
|
||||
| | | | | |
|
||||
| |4 3| | | |2| | | |10.0| |
|
||||
| |2 1| | | |4| | | | 4.0| |
|
||||
*/
|
||||
|
||||
const size_t blocksize = 2;
|
||||
const size_t N = 2;
|
||||
const T weight = 0.5;
|
||||
|
||||
std::vector<T> h_blockVector({1.0,2.0,3.0,4.0,4.0,3.0,2.0,1.0});
|
||||
std::vector<T> h_vecVector({1.0,3.0,2.0,4.0});
|
||||
std::vector<T> h_dstVector({0, 0, 0, 0});
|
||||
Opm::cuistl::CuVector<T> d_blockVector(h_blockVector);
|
||||
Opm::cuistl::CuVector<T> d_vecVector(h_vecVector);
|
||||
Opm::cuistl::CuVector<T> d_dstVector(h_dstVector);
|
||||
|
||||
Opm::cuistl::detail::blockVectorMultiplicationAtAllIndices(d_blockVector.data(), N, blocksize, d_vecVector.data());
|
||||
Opm::cuistl::detail::weightedDiagMV(d_blockVector.data(), N, blocksize, weight, d_vecVector.data(), d_dstVector.data());
|
||||
|
||||
std::vector<T> expected_vec{7.0,15.0,20.0,8.0};
|
||||
std::vector<T> computed_vec = d_vecVector.asStdVector();
|
||||
std::vector<T> expected_vec{3.5,7.5,10.0,4.0};
|
||||
std::vector<T> computed_vec = d_dstVector.asStdVector();
|
||||
|
||||
BOOST_REQUIRE_EQUAL(expected_vec.size(), computed_vec.size());
|
||||
for (size_t i = 0; i < expected_vec.size(); i++){
|
||||
|
||||
@@ -86,7 +86,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize2, T, NumericTypes)
|
||||
h_dune_b[1][0] = 3.0;
|
||||
h_dune_b[1][1] = 4.0;
|
||||
|
||||
const T expected_ans[2][2] = {{1.0, 0.0},{-1.0,-3.0/2.0}};
|
||||
const T expected_ans[2][2] = {{0.0, 5.0/2.0},{-3.0/2.0, -2.0}};
|
||||
|
||||
CuJac.apply(h_dune_x, h_dune_b);
|
||||
BOOST_CHECK_CLOSE(h_dune_x[0][0], expected_ans[0][0], 1e-7);
|
||||
|
||||
Reference in New Issue
Block a user