Fuse kernels to optimize performance and update the tests and documentation.
Tobias Meyer Andersen
2023-10-12 16:06:21 +02:00
parent fbeb006d45
commit c809819cbd
6 changed files with 77 additions and 64 deletions


@@ -83,24 +83,18 @@ CuJac<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
template <class M, class X, class Y, int l>
void
CuJac<M, X, Y, l>::apply(X& v, const Y& d)
{
// Jacobi preconditioner: x_{n+1} = x_n + w * (D^-1 * (b - Ax_n))
// Working with the defect d and the update v, we only need to set v = w*(D^-1)*d
const auto numberOfRows = detail::to_int(cuMatrix.N());
const auto numberOfNonzeroBlocks = detail::to_int(cuMatrix.nonzeroes());
const auto blockSize = detail::to_int(cuMatrix.blockSize());
// Could probably be further optimized a bit by fusing the copy with the
// following multiplication. To do so, add a kernel that does not do the
// multiplication in place.
v = d; // cuda copy
detail::blockVectorMultiplicationAtAllIndices(cuVec_diagInvFlattened.data(),
detail::to_size_t(numberOfRows),
detail::to_size_t(blockSize),
v.data());
v *= relaxation_factor; // cuBlas axpy
// Compute the MV product where the matrix is diagonal and therefore stored as a vector.
// The product is thus computed as a Hadamard product.
detail::weightedDiagMV(cuVec_diagInvFlattened.data(),
detail::to_size_t(cuMatrix.N()),
detail::to_size_t(cuMatrix.blockSize()),
relaxation_factor,
d.data(),
v.data());
}
template <class M, class X, class Y, int l>
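The fused path replaces three device operations (the CUDA copy, the in-place block multiply, and the cuBLAS scaling) with a single kernel launch computing v = w * D^-1 * d. As a reference for the semantics, here is a minimal host-side sketch (a hypothetical helper, not part of the commit), assuming the inverted diagonal blocks are stored contiguously in row-major order:

#include <cstddef>
#include <vector>

// Host-side reference for the fused operation: dst = w * Dinv * src, where
// Dinv is block diagonal and stored as numberOfRows contiguous row-major
// blocksize x blocksize blocks. Hypothetical sketch, not part of the commit.
template <class T>
void weightedDiagMVReference(const std::vector<T>& diagInvFlattened,
                             std::size_t numberOfRows,
                             std::size_t blocksize,
                             T w,
                             const std::vector<T>& src,
                             std::vector<T>& dst)
{
    for (std::size_t row = 0; row < numberOfRows; ++row) {
        const T* block = diagInvFlattened.data() + blocksize * blocksize * row;
        const T* x = src.data() + blocksize * row;
        T* y = dst.data() + blocksize * row;
        for (std::size_t i = 0; i < blocksize; ++i) {
            T sum = 0;
            for (std::size_t j = 0; j < blocksize; ++j) {
                sum += block[blocksize * i + j] * x[j];
            }
            y[i] = w * sum; // the relaxation factor is fused into the product
        }
    }
}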


@@ -30,10 +30,7 @@
namespace Opm::cuistl
{
//! \brief Sequential Jacobi preconditioner on the GPU through the CuSparse library.
//!
//! This implementation calls the CuSparse functions, which in turn essentially
//! does a level decomposition to get some parallelism.
//! \brief Sequential Jacobi preconditioner on the GPU.
//!
//! \note This is not expected to be a fast preconditioner.
//!
@@ -50,7 +47,7 @@ class CuJac : public Dune::PreconditionerWithUpdate<X, Y>
{
public:
//! \brief The matrix type the preconditioner is for.
using matrix_type = typename std::remove_const<M>::type ;
using matrix_type = typename std::remove_const<M>::type;
//! \brief The domain type of the preconditioner.
using domain_type = X;
//! \brief The range type of the preconditioner.
@@ -101,7 +98,7 @@ private:
//! \brief Reference to the underlying matrix
const M& cpuMatrix;
//! \brief The relaxation factor to use.
field_type relaxation_factor;
const field_type relaxation_factor;
//! \brief The A matrix stored on the gpu
CuSparseMatrix<field_type> cuMatrix;
//! \brief The diagonal of cuMatrix inverted, and then flattened to fit in a vector
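For the kernels this commit touches, the flattened storage is assumed to hold each inverted diagonal block contiguously in row-major order, e.g. for blocksize 2:

// Assumed layout of cuVec_diagInvFlattened (row-major 2x2 blocks):
// [ B0_00, B0_01, B0_10, B0_11,  B1_00, B1_01, B1_10, B1_11, ... ]
// Block I therefore starts at offset blocksize * blocksize * I, which is
// exactly the pMat indexing used in the kernels below.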


@@ -48,50 +48,56 @@ namespace
}
template <class T>
__global__ void elementWiseMultiplyMVKernelBlocksize1(T* squareBlockVector, const size_t numberOfElements, T* vec)
__global__ void weightedDiagMVBlocksize1(
T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
const int blocksize = 1;
if (globalIndex < numberOfElements) {
T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
T* pVec = (vec + (blocksize * globalIndex));
const T* srcVec = (src_vec + (blocksize * globalIndex));
T* dstVec = (dst_vec + (blocksize * globalIndex));
T v0 = pVec[0];
pVec[0] = pMat[0] * v0;
T v0 = srcVec[0];
dstVec[0] = (pMat[0] * v0) * w;
}
}
template <class T>
__global__ void elementWiseMultiplyMVKernelBlocksize2(T* squareBlockVector, const size_t numberOfElements, T* vec)
__global__ void weightedDiagMVBlocksize2(
T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
const int blocksize = 2;
if (globalIndex < numberOfElements) {
T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
T* pVec = (vec + (blocksize * globalIndex));
const T* srcVec = (src_vec + (blocksize * globalIndex));
T* dstVec = (dst_vec + (blocksize * globalIndex));
T v0 = pVec[0], v1 = pVec[1];
pVec[0] = pMat[0] * v0 + pMat[1] * v1;
pVec[1] = pMat[2] * v0 + pMat[3] * v1;
T v0 = srcVec[0], v1 = srcVec[1];
dstVec[0] = (pMat[0] * v0 + pMat[1] * v1) * w;
dstVec[1] = (pMat[2] * v0 + pMat[3] * v1) * w;
}
}
template <class T>
__global__ void elementWiseMultiplyMVKernelBlocksize3(T* squareBlockVector, const size_t numberOfElements, T* vec)
__global__ void weightedDiagMVBlocksize3(
T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
{
const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
const int blocksize = 3;
if (globalIndex < numberOfElements) {
T* pMat = (squareBlockVector + (blocksize * blocksize * globalIndex));
T* pVec = (vec + (blocksize * globalIndex));
const T* srcVec = (src_vec + (blocksize * globalIndex));
T* dstVec = (dst_vec + (blocksize * globalIndex));
T v0 = pVec[0], v1 = pVec[1], v2 = pVec[2];
pVec[0] = pMat[0] * v0 + pMat[1] * v1 + pMat[2] * v2;
pVec[1] = pMat[3] * v0 + pMat[4] * v1 + pMat[5] * v2;
pVec[2] = pMat[6] * v0 + pMat[7] * v1 + pMat[8] * v2;
T v0 = srcVec[0], v1 = srcVec[1], v2 = srcVec[2];
dstVec[0] = (pMat[0] * v0 + pMat[1] * v1 + pMat[2] * v2) * w;
dstVec[1] = (pMat[3] * v0 + pMat[4] * v1 + pMat[5] * v2) * w;
dstVec[2] = (pMat[6] * v0 + pMat[7] * v1 + pMat[8] * v2) * w;
}
}
@@ -162,23 +168,25 @@ template int innerProductAtIndices(const int*, const int*, int* buffer, size_t,
template <class T>
void
blockVectorMultiplicationAtAllIndices(T* squareBlockVector,
const size_t numberOfElements,
const size_t blocksize,
T* vec)
weightedDiagMV(T* squareBlockVector,
const size_t numberOfElements,
const size_t blocksize,
T relaxation_factor,
const T* src_vec,
T* dst_vec)
{
switch (blocksize) {
case 1:
elementWiseMultiplyMVKernelBlocksize1<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, vec);
weightedDiagMVBlocksize1<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxation_factor, src_vec, dst_vec);
break;
case 2:
elementWiseMultiplyMVKernelBlocksize2<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, vec);
weightedDiagMVBlocksize2<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxation_factor, src_vec, dst_vec);
break;
case 3:
elementWiseMultiplyMVKernelBlocksize3<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, vec);
weightedDiagMVBlocksize3<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
squareBlockVector, numberOfElements, relaxation_factor, src_vec, dst_vec);
break;
default:
OPM_THROW(std::invalid_argument, "blockvector hadamard product not defined for blocksize>3");
@@ -186,7 +194,7 @@ blockVectorMultiplicationAtAllIndices(T* squareBlockVector,
}
}
template void blockVectorMultiplicationAtAllIndices(double*, const size_t, const size_t, double*);
template void blockVectorMultiplicationAtAllIndices(float*, const size_t, const size_t, float*);
template void weightedDiagMV(double*, const size_t, const size_t, double, const double*, double*);
template void weightedDiagMV(float*, const size_t, const size_t, float, const float*, float*);
} // namespace Opm::cuistl::detail
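The three kernels differ only in the hand-unrolled block dimension. A hypothetical generalization (not part of this commit) would template the kernel on the block size and let the compiler unroll the fixed-size loops:

// Hypothetical sketch: one kernel templated on the compile-time block size.
template <class T, int blocksize>
__global__ void weightedDiagMVGeneric(
    T* squareBlockVector, const size_t numberOfElements, const T w, const T* src_vec, T* dst_vec)
{
    const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
    if (globalIndex < numberOfElements) {
        const T* pMat = squareBlockVector + (blocksize * blocksize * globalIndex);
        const T* srcVec = src_vec + (blocksize * globalIndex);
        T* dstVec = dst_vec + (blocksize * globalIndex);
        // blocksize is a compile-time constant, so these loops unroll just
        // like the hand-written kernels; src and dst are distinct buffers,
        // so reading srcVec while writing dstVec is safe.
        for (int i = 0; i < blocksize; ++i) {
            T sum = 0;
            for (int j = 0; j < blocksize; ++j) {
                sum += pMat[blocksize * i + j] * srcVec[j];
            }
            dstVec[i] = sum * w;
        }
    }
}

The switch in weightedDiagMV would then dispatch to weightedDiagMVGeneric<T, 1>, <T, 2> and <T, 3>.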


@@ -56,15 +56,23 @@ template <class T>
T innerProductAtIndices(const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices);
/**
* @brief Compute the Hadamard product of two vectors element by element, where the first vector contains square blocks, while the second vector contains vectors
* @brief Compute the weighted matrix-vector product where the matrix is diagonal and its diagonal is stored as a vector, meaning we
* compute a Hadamard product.
* @param squareBlockVector A CuVector whose elements are NxN matrix blocks
* @param numberOfRows The number of rows in the vector
* @param blocksize The side length of the square block elements in the vector
* @param vec A pointer to the data of the CuVector we multiply the blockvector with; the result is stored in vec
*
* @note This is implemented as a faster way to multiply a diagonal matrix with a blockvector. We need only store the diagonal of the matrix and use this product.
* @param src_vec A pointer to the data of the CuVector we multiply the blockvector with
* @param dst_vec A pointer to the data of the CuVector we store the result in
*
* @note This is implemented as a faster way to multiply a diagonal matrix with a blockvector. We need only store the
* diagonal of the matrix and use this product.
*/
template <class T>
void blockVectorMultiplicationAtAllIndices(T* squareBlockVector, const size_t numberOfRows, const size_t blocksize, T* vec);
void weightedDiagMV(T* squareBlockVector,
const size_t numberOfRows,
const size_t blocksize,
T relaxation_factor,
const T* src_vec,
T* dst_vec);
} // namespace Opm::cuistl::detail
#endif
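In formula form (restating the doc comment above), with D = blockdiag(D_1, ..., D_N) and block size n, the declared operation computes per block row I:

\[ \mathrm{dst}_I = w \, D_I \, \mathrm{src}_I, \qquad D_I \in \mathbb{R}^{n \times n}, \]

which for n = 1 reduces to a plain weighted Hadamard product dst_i = w * d_i * src_i.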


@@ -42,16 +42,19 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf3By3BlockVectorAndVecto
const size_t blocksize = 3;
const size_t N = 1;
const T weight = 1.0;
std::vector<T> h_blockVector({1.0,2.0,3.0,5.0,2.0,3.0,2.0,1.0,2.0});
std::vector<T> h_vecVector({3.0,2.0,1.0});
std::vector<T> h_dstVector({0, 0, 0});
Opm::cuistl::CuVector<T> d_blockVector(h_blockVector);
Opm::cuistl::CuVector<T> d_vecVector(h_vecVector);
Opm::cuistl::CuVector<T> d_dstVector(h_dstVector);
Opm::cuistl::detail::blockVectorMultiplicationAtAllIndices(d_blockVector.data(), N, blocksize, d_vecVector.data());
Opm::cuistl::detail::weightedDiagMV(d_blockVector.data(), N, blocksize, weight, d_vecVector.data(), d_dstVector.data());
std::vector<T> expected_vec{10.0,22.0,10.0};
std::vector<T> computed_vec = d_vecVector.asStdVector();
std::vector<T> computed_vec = d_dstVector.asStdVector();
BOOST_REQUIRE_EQUAL(expected_vec.size(), computed_vec.size());
for (size_t i = 0; i < expected_vec.size(); i++){
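A quick hand check of the expected values above (weight 1.0): multiplying the block [[1,2,3],[5,2,3],[2,1,2]] by (3,2,1)^T gives (3+4+3, 15+4+3, 6+2+2)^T = (10, 22, 10)^T, matching expected_vec.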
@@ -63,25 +66,28 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(ElementWiseMultiplicationOf2By2BlockVectorAndVecto
{
/*
Example in the test for element-wise multiplication of a blockvector by a vector of vectors
| |1 2| | | |1| | | | 7| |
| |3 4| | X | |3| | = | |15| |
| | | | | |
| |4 3| | | |2| | | |20| |
| |2 1| | | |4| | | | 8| |
| |1 2| | | |1| | | | 3.5| |
0.5 * | |3 4| | X | |3| | = | | 7.5| |
| | | | | |
| |4 3| | | |2| | | |10.0| |
| |2 1| | | |4| | | | 4.0| |
*/
const size_t blocksize = 2;
const size_t N = 2;
const T weight = 0.5;
std::vector<T> h_blockVector({1.0,2.0,3.0,4.0,4.0,3.0,2.0,1.0});
std::vector<T> h_vecVector({1.0,3.0,2.0,4.0});
std::vector<T> h_dstVector({0, 0, 0, 0});
Opm::cuistl::CuVector<T> d_blockVector(h_blockVector);
Opm::cuistl::CuVector<T> d_vecVector(h_vecVector);
Opm::cuistl::CuVector<T> d_dstVector(h_dstVector);
Opm::cuistl::detail::blockVectorMultiplicationAtAllIndices(d_blockVector.data(), N, blocksize, d_vecVector.data());
Opm::cuistl::detail::weightedDiagMV(d_blockVector.data(), N, blocksize, weight, d_vecVector.data(), d_dstVector.data());
std::vector<T> expected_vec{7.0,15.0,20.0,8.0};
std::vector<T> computed_vec = d_vecVector.asStdVector();
std::vector<T> expected_vec{3.5,7.5,10.0,4.0};
std::vector<T> computed_vec = d_dstVector.asStdVector();
BOOST_REQUIRE_EQUAL(expected_vec.size(), computed_vec.size());
for (size_t i = 0; i < expected_vec.size(); i++){


@@ -86,7 +86,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(CUJACApplyBlocksize2, T, NumericTypes)
h_dune_b[1][0] = 3.0;
h_dune_b[1][1] = 4.0;
const T expected_ans[2][2] = {{1.0, 0.0},{-1.0,-3.0/2.0}};
const T expected_ans[2][2] = {{0.0, 5.0/2.0},{-3.0/2.0, -2.0}};
CuJac.apply(h_dune_x, h_dune_b);
BOOST_CHECK_CLOSE(h_dune_x[0][0], expected_ans[0][0], 1e-7);