Fuse kernels to optimize performance and update the tests and documentation.
Tobias Meyer Andersen
2023-10-12 16:06:21 +02:00
parent fbeb006d45
commit c809819cbd
6 changed files with 77 additions and 64 deletions


@@ -83,24 +83,18 @@ CuJac<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
template <class M, class X, class Y, int l>
void
CuJac<M, X, Y, l>::apply(X& v, const Y& d)
{
// Jacobi preconditioner: x_{n+1} = x_n + w * (D^-1 * (b - Ax_n))
// Working with the defect d and the update v, we only need to set v = w*(D^-1)*d
const auto numberOfRows = detail::to_int(cuMatrix.N());
const auto numberOfNonzeroBlocks = detail::to_int(cuMatrix.nonzeroes());
const auto blockSize = detail::to_int(cuMatrix.blockSize());
// Could probably be further optimized a bit by fusing the copy with the
// following multiplication. To do so, add a kernel that does not do the
// multiplication in place.
v = d; // cuda copy
detail::blockVectorMultiplicationAtAllIndices(cuVec_diagInvFlattened.data(),
detail::to_size_t(numberOfRows),
detail::to_size_t(blockSize),
v.data());
v *= relaxation_factor; // cuBlas axpy
// Compute the MV product where the matrix is diagonal and therefore stored as a vector.
// The product is thus computed as a Hadamard product.
detail::weightedDiagMV(cuVec_diagInvFlattened.data(),
detail::to_size_t(cuMatrix.N()),
detail::to_size_t(cuMatrix.blockSize()),
relaxation_factor,
d.data(),
v.data());
}
template <class M, class X, class Y, int l>
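The fused path replaces three device operations (the CUDA copy, the in-place block multiply, and the cuBLAS scaling) with a single kernel launch computing v = w * D^-1 * d. As a reference for the semantics, here is a minimal host-side sketch (a hypothetical helper, not part of the commit), assuming the inverted diagonal blocks are stored contiguously in row-major order:

#include <cstddef>
#include <vector>

// Host-side reference for the fused operation: dst = w * Dinv * src, where
// Dinv is block diagonal and stored as numberOfRows contiguous row-major
// blocksize x blocksize blocks. Hypothetical sketch, not part of the commit.
template <class T>
void weightedDiagMVReference(const std::vector<T>& diagInvFlattened,
                             std::size_t numberOfRows,
                             std::size_t blocksize,
                             T w,
                             const std::vector<T>& src,
                             std::vector<T>& dst)
{
    for (std::size_t row = 0; row < numberOfRows; ++row) {
        const T* block = diagInvFlattened.data() + blocksize * blocksize * row;
        const T* x = src.data() + blocksize * row;
        T* y = dst.data() + blocksize * row;
        for (std::size_t i = 0; i < blocksize; ++i) {
            T sum = 0;
            for (std::size_t j = 0; j < blocksize; ++j) {
                sum += block[blocksize * i + j] * x[j];
            }
            y[i] = w * sum; // the relaxation factor is fused into the product
        }
    }
}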


@@ -30,10 +30,7 @@
namespace Opm::cuistl
{
//! \brief Sequential Jacobi preconditioner on the GPU through the CuSparse library.
//!
//! This implementation calls the CuSparse functions, which in turn essentially
//! does a level decomposition to get some parallelism.
//! \brief Sequential Jacobi preconditioner on the GPU.
//!
//! \note This is not expected to be a fast preconditioner.
//!
@@ -50,7 +47,7 @@ class CuJac : public Dune::PreconditionerWithUpdate<X, Y>
{
public:
//! \brief The matrix type the preconditioner is for.
using matrix_type = typename std::remove_const<M>::type ;
using matrix_type = typename std::remove_const<M>::type;
//! \brief The domain type of the preconditioner.
using domain_type = X;
//! \brief The range type of the preconditioner.
@@ -101,7 +98,7 @@ private:
//! \brief Reference to the underlying matrix
const M& cpuMatrix;
//! \brief The relaxation factor to use.
field_type relaxation_factor;
const field_type relaxation_factor;
//! \brief The A matrix stored on the gpu
CuSparseMatrix<field_type> cuMatrix;
//! \brief The diagonal of cuMatrix inverted, and then flattened to fit in a vector
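For the kernels this commit touches, the flattened storage is assumed to hold each inverted diagonal block contiguously in row-major order, e.g. for blocksize 2:

// Assumed layout of cuVec_diagInvFlattened (row-major 2x2 blocks):
// [ B0_00, B0_01, B0_10, B0_11,  B1_00, B1_01, B1_10, B1_11, ... ]
// Block I therefore starts at offset blocksize * blocksize * I, which is
// exactly the pMat indexing used in the kernels below.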


@@ -48,50 +48,56 @@ namespace
}
template <class T>
__global__ void elementWiseMultiplyMVKernelBlocksize1(T* squareBlockVector, const size_t numberOfElements, T* vec)
__global__ void weightedDiagMVBlocksize1(
T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
const int blocksize = 1;
if (globalIndex < numberOfElements) {
T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
T* pVec = (vec + (blocksize * globalIndex));
const T* srcVec = (src_vec + (blocksize * globalIndex));
T* dstVec = (dst_vec + (blocksize * globalIndex));
T v0 = pVec[0];
pVec[0] = pMat[0] * v0;
T v0 = srcVec[0];
dstVec[0] = (pMat[0] * v0) * w;
}
}
template <class T>
__global__ void elementWiseMultiplyMVKernelBlocksize2(T* squareBlockVector, const size_t numberOfElements, T* vec)
__global__ void weightedDiagMVBlocksize2(
T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
const int blocksize = 2;
if (globalIndex < numberOfElements) {
T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
T* pVec = (vec + (blocksize * globalIndex));
const T* srcVec = (src_vec + (blocksize * globalIndex));
T* dstVec = (dst_vec + (blocksize * globalIndex));
T v0 = pVec[0], v1 = pVec[1];
pVec[0] = pMat[0] * v0 + pMat[1] * v1;
pVec[1] = pMat[2] * v0 + pMat[3] * v1;
T v0 = srcVec[0], v1 = srcVec[1];
dstVec[0] = (pMat[0] * v0 + pMat[1] * v1) * w;
dstVec[1] = (pMat[2] * v0 + pMat[3] * v1) * w;
}
}
template <class T>
__global__ void elementWiseMultiplyMVKernelBlocksize3(T* squareBlockVector, const size_t numberOfElements, T* vec)
__global__ void weightedDiagMVBlocksize3(
T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
const int blocksize = 3;
if (globalIndex < numberOfElements) {
T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
T* pVec = (vec + (blocksize * globalIndex));
const T* srcVec = (src_vec + (blocksize * globalIndex));
T* dstVec = (dst_vec + (blocksize * globalIndex));
T v0 = pVec[0], v1 = pVec[1], v2 = pVec[2];
pVec[0] = pMat[0] * v0 + pMat[1] * v1 + pMat[2] * v2;
pVec[1] = pMat[3] * v0 + pMat[4] * v1 + pMat[5] * v2;
pVec[2] = pMat[6] * v0 + pMat[7] * v1 + pMat[8] * v2;
T v0 = srcVec[0], v1 = srcVec[1], v2 = srcVec[2];
dstVec[0] = (pMat[0] * v0 + pMat[1] * v1 + pMat[2] * v2) * w;
dstVec[1] = (pMat[3] * v0 + pMat[4] * v1 + pMat[5] * v2) * w;
dstVec[2] = (pMat[6] * v0 + pMat[7] * v1 + pMat[8] * v2) * w;
}
}
@@ -162,23 +168,25 @@ template int innerProductAtIndices(const int*, const int*, int* buffer, size_t,
template <class T>
void
blockVectorMultiplicationAtAllIndices(T* squareBlockVector,
const size_t numberOfElements,
const size_t blocksize,
T* vec)
weightedDiagMV(T* squareBlockVector,
const size_t numberOfElements,
const size_t blocksize,
T relaxation_factor,
const T* src_vec,
T* dst_vec)
{
switch (blocksize) {
case 1:
elementWiseMultiplyMVKernelBlocksize1<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, vec);
weightedDiagMVBlocksize1<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxation_factor, src_vec, dst_vec);
break;
case 2:
elementWiseMultiplyMVKernelBlocksize2<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, vec);
weightedDiagMVBlocksize2<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxation_factor, src_vec, dst_vec);
break;
case 3:
elementWiseMultiplyMVKernelBlocksize3<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, vec);
weightedDiagMVBlocksize3<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxation_factor, src_vec, dst_vec);
break;
default:
OPM_THROW(std::invalid_argument, "blockvector hadamard product not defined for blocksize>3");
@@ -186,7 +194,7 @@ blockVectorMultiplicationAtAllIndices(T* squareBlockVector,
}
}
template void blockVectorMultiplicationAtAllIndices(double*, const size_t, const size_t, double*);
template void blockVectorMultiplicationAtAllIndices(float*, const size_t, const size_t, float*);
template void weightedDiagMV(double*, const size_t, const size_t, double, const double*, double*);
template void weightedDiagMV(float*, const size_t, const size_t, float, const float*, float*);
} // namespace Opm::cuistl::detail
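The three kernels differ only in the hand-unrolled block dimension. A hypothetical generalization (not part of this commit) would template the kernel on the block size and let the compiler unroll the fixed-size loops:

// Hypothetical sketch: one kernel templated on the compile-time block size.
template <class T, int blocksize>
__global__ void weightedDiagMVGeneric(
    T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
{
    const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
    if (globalIndex < numberOfElements) {
        const T* pMat = squareBlockVector + (blocksize * blocksize * globalIndex);
        const T* srcVec = src_vec + (blocksize * globalIndex);
        T* dstVec = dst_vec + (blocksize * globalIndex);
        // blocksize is a compile-time constant, so these loops unroll just
        // like the hand-written kernels; src and dst are distinct buffers,
        // so reading srcVec while writing dstVec is safe.
        for (int i = 0; i < blocksize; ++i) {
            T sum = 0;
            for (int j = 0; j < blocksize; ++j) {
                sum += pMat[blocksize * i + j] * srcVec[j];
            }
            dstVec[i] = sum * w;
        }
    }
}

The switch in weightedDiagMV would then dispatch to weightedDiagMVGeneric<T, 1>, <T, 2> and <T, 3>.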


@@ -56,15 +56,23 @@ template <class T>
T innerProductAtIndices(const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices);
/**
* @brief Compute the Hadamard product of two vectors element by element, where the first vector contains square blocks, while the second vector contains vectors
* @brief Compute the weighted matrix-vector product where the matrix is diagonal and its diagonal is stored as a vector, meaning we
* compute a Hadamard product.
* @param squareBlockVector A CuVector whose elements are NxN matrix blocks
* @param numberOfRows The number of rows in the vector
* @param blocksize The side length of the square block elements in the vector
* @param vec A pointer to the data of the CuVector we multiply the blockvector with; the result is stored in vec
*
* @note This is implemented as a faster way to multiply a diagonal matrix with a blockvector. We need only store the diagonal of the matrix and use this product.
* @param src_vec A pointer to the data of the CuVector we multiply the blockvector with
* @param dst_vec A pointer to the data of the CuVector we store the result in
*
* @note This is implemented as a faster way to multiply a diagonal matrix with a blockvector. We need only store the
* diagonal of the matrix and use this product.
*/
template <class T>
void blockVectorMultiplicationAtAllIndices(T* squareBlockVector, const size_t numberOfRows, const size_t blocksize, T* vec);
void weightedDiagMV(T* squareBlockVector,
const size_t numberOfRows,
const size_t blocksize,
T relaxation_factor,
const T* src_vec,
T* dst_vec);
} // namespace Opm::cuistl::detail
#endif
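In formula form (restating the doc comment above), with D = blockdiag(D_1, ..., D_N) and block size n, the declared operation computes per block row I:

\[ \mathrm{dst}_I = w \, D_I \, \mathrm{src}_I, \qquad D_I \in \mathbb{R}^{n \times n}, \]

which for n = 1 reduces to a plain weighted Hadamard product dst_i = w * d_i * src_i.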


@@ -42,16 +42,19 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf3By3BlockVectorAndVecto
const size_t blocksize = 3;
const size_t N = 1;
const T weight = 1.0;
std::vector<T> h_blockVector({1.0,2.0,3.0,5.0,2.0,3.0,2.0,1.0,2.0});
std::vector<T> h_vecVector({3.0,2.0,1.0});
std::vector<T> h_dstVector({0, 0, 0});
Opm::cuistl::CuVector<T> d_blockVector(h_blockVector);
Opm::cuistl::CuVector<T> d_vecVector(h_vecVector);
Opm::cuistl::CuVector<T> d_dstVector(h_dstVector);
Opm::cuistl::detail::blockVectorMultiplicationAtAllIndices(d_blockVector.data(), N, blocksize, d_vecVector.data());
Opm::cuistl::detail::weightedDiagMV(d_blockVector.data(), N, blocksize, weight, d_vecVector.data(), d_dstVector.data());
std::vector<T> expected_vec{10.0,22.0,10.0};
std::vector<T> computed_vec = d_vecVector.asStdVector();
std::vector<T> computed_vec = d_dstVector.asStdVector();
BOOST_REQUIRE_EQUAL(expected_vec.size(), computed_vec.size());
for (size_t i = 0; i < expected_vec.size(); i++){
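A quick hand check of the expected values above (weight 1.0): multiplying the block [[1,2,3],[5,2,3],[2,1,2]] by (3,2,1)^T gives (3+4+3, 15+4+3, 6+2+2)^T = (10, 22, 10)^T, matching expected_vec.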
@@ -63,25 +66,28 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf2By2BlockVectorAndVecto
{
/*
Example in the test for element-wise multiplication of a blockvector by a vector of vectors
| |1 2| | | |1| | | | 7| |
| |3 4| | X | |3| | = | |15| |
| | | | | |
| |4 3| | | |2| | | |20| |
| |2 1| | | |4| | | | 8| |
| |1 2| | | |1| | | | 3.5| |
0.5 * | |3 4| | X | |3| | = | | 7.5| |
| | | | | |
| |4 3| | | |2| | | |10.0| |
| |2 1| | | |4| | | | 4.0| |
*/
const size_t blocksize = 2;
const size_t N = 2;
const T weight = 0.5;
std::vector<T> h_blockVector({1.0,2.0,3.0,4.0,4.0,3.0,2.0,1.0});
std::vector<T> h_vecVector({1.0,3.0,2.0,4.0});
std::vector<T> h_dstVector({0, 0, 0, 0});
Opm::cuistl::CuVector<T> d_blockVector(h_blockVector);
Opm::cuistl::CuVector<T> d_vecVector(h_vecVector);
Opm::cuistl::CuVector<T> d_dstVector(h_dstVector);
Opm::cuistl::detail::blockVectorMultiplicationAtAllIndices(d_blockVector.data(), N, blocksize, d_vecVector.data());
Opm::cuistl::detail::weightedDiagMV(d_blockVector.data(), N, blocksize, weight, d_vecVector.data(), d_dstVector.data());
std::vector<T> expected_vec{7.0,15.0,20.0,8.0};
std::vector<T> computed_vec = d_vecVector.asStdVector();
std::vector<T> expected_vec{3.5,7.5,10.0,4.0};
std::vector<T> computed_vec = d_dstVector.asStdVector();
BOOST_REQUIRE_EQUAL(expected_vec.size(), computed_vec.size());
for (size_t i = 0; i < expected_vec.size(); i++){


@@ -86,7 +86,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize2, T, NumericTypes)
h_dune_b[1][0] = 3.0;
h_dune_b[1][1] = 4.0;
const T expected_ans[2][2] = {{1.0, 0.0},{-1.0,-3.0/2.0}};
const T expected_ans[2][2] = {{0.0, 5.0/2.0},{-3.0/2.0, -2.0}};
CuJac.apply(h_dune_x, h_dune_b);
BOOST_CHECK_CLOSE(h_dune_x[0][0], expected_ans[0][0], 1e-7);