Simplify and optimize the Jacobi preconditioner

apply. Fix .h and .cpp mistakes in cmake.
2025-02-25 18:55:30 -06:00 · 2023-10-12 13:07:56 +02:00
parent 1e4b0e97ee
commit fbeb006d45
4 changed files with 15 additions and 57 deletions
--- a/opm/simulators/linalg/cuistl/CuJac.cpp
+++ b/opm/simulators/linalg/cuistl/CuJac.cpp
@@ -49,10 +49,6 @@ CuJac<M, X, Y, l>::CuJac(const M& A, field_type w)
    , relaxation_factor(w)
    , cuMatrix(CuSparseMatrix<field_type>::fromMatrix(detail::makeMatrixWithNonzeroDiagonal(A)))
    , cuVec_diagInvFlattened(cuMatrix.N() * cuMatrix.blockSize() * cuMatrix.blockSize())
-    , cuVec_resultBuffer(cuMatrix.N() * cuMatrix.blockSize())
-    , cuMatrixDescription(detail::createMatrixDescription())
-    , cuSparseHandle(detail::CuSparseHandle::getInstance())
-    , cuBlasHandle(detail::CuBlasHandle::getInstance())
 {
    // Some sanity check
    OPM_ERROR_IF(A.N() != cuMatrix.N(),
@@ -86,53 +82,25 @@ CuJac<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)

 template <class M, class X, class Y, int l>
 void
-CuJac<M, X, Y, l>::apply(X& x, const Y& b)
-{
-    // x_{n+1} = x_n + w * (D^-1 * (b - Ax_n) )
-    // cusparseDbsrmv computes -Ax_n + b
-    // cuda kernel computes D^-1 * (b- Ax_n), call this rhs
-    // use an axpy to compute x + w*rhs
-
-    const field_type one = 1.0, neg_one = -1.0;
+CuJac<M, X, Y, l>::apply(X& v, const Y& d)
+{   
+    // Jacobi preconditioner: x_{n+1} = x_n + w * (D^-1 * (b - Ax_n) )
+    // Working with defect d and update v it we only need to set v = w*(D^-1)*d

    const auto numberOfRows = detail::to_int(cuMatrix.N());
-    const auto numberOfColumns = detail::to_int(cuMatrix.N());
    const auto numberOfNonzeroBlocks = detail::to_int(cuMatrix.nonzeroes());
    const auto blockSize = detail::to_int(cuMatrix.blockSize());

-    auto nonZeroValues = cuMatrix.getNonZeroValues().data();
-    auto rowIndices = cuMatrix.getRowIndices().data();
-    auto columnIndices = cuMatrix.getColumnIndices().data();
+    // Could probably be further optimized a little bit by fusing the copy with the 
+    // following multiplication. To do so add a kernel that does not do the multiplication
+    // in place.

-    // TODO: avoid making this copy, currently forced to because parameter is a const
-    cuVec_resultBuffer = b;
-
-    // bsrmv computes b - Ax
-    OPM_CUSPARSE_SAFE_CALL(detail::cusparseBsrmv(cuSparseHandle.get(),
-                                                 detail::CUSPARSE_MATRIX_ORDER,
-                                                 CUSPARSE_OPERATION_NON_TRANSPOSE,
-                                                 numberOfRows,
-                                                 numberOfColumns,
-                                                 numberOfNonzeroBlocks,
-                                                 &neg_one,
-                                                 cuMatrixDescription->get(),
-                                                 nonZeroValues,
-                                                 rowIndices,
-                                                 columnIndices,
-                                                 blockSize,
-                                                 x.data(),
-                                                 &one,
-                                                 cuVec_resultBuffer.data()));
-
-    // multiply D^-1 with (b-Ax)
+    v = d; // cuda copy
    detail::blockVectorMultiplicationAtAllIndices(cuVec_diagInvFlattened.data(),
                                                  detail::to_size_t(numberOfRows),
                                                  detail::to_size_t(blockSize),
-                                                  cuVec_resultBuffer.data());
-
-    // Finish adding w*rhs to x
-    OPM_CUBLAS_SAFE_CALL(detail::cublasAxpy(
-        cuBlasHandle.get(), numberOfRows * blockSize, &relaxation_factor, cuVec_resultBuffer.data(), 1, x.data(), 1));
+                                                  v.data());
+    v *= relaxation_factor; // cuBlas axpy
 }

 template <class M, class X, class Y, int l>
--- a/opm/simulators/linalg/cuistl/CuJac.hpp
+++ b/opm/simulators/linalg/cuistl/CuJac.hpp
@@ -102,17 +102,10 @@ private:
    const M& cpuMatrix;
    //! \brief The relaxation factor to use.
    field_type relaxation_factor;
-
+    //! \brief The A matrix stored on the gpu
    CuSparseMatrix<field_type> cuMatrix;
-    
-    CuVector<field_type> cuVec_diagInvFlattened; // the diagonal of cuMatrix inverted, and then flattened to fit in a vector
-    CuVector<field_type> cuVec_resultBuffer;
-
-    detail::CuSparseMatrixDescriptionPtr cuMatrixDescription;
-    detail::CuSparseHandle& cuSparseHandle;
-    detail::CuBlasHandle& cuBlasHandle;
-    
-    size_t findBufferSize();
+    //! \brief the diagonal of cuMatrix inverted, and then flattened to fit in a vector
+    CuVector<field_type> cuVec_diagInvFlattened;
 };
 } // end namespace Opm::cuistl

--- a/opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp
+++ b/opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp
@@ -80,10 +80,6 @@ public:
            m_inputBuffer.reset(new CuVector<field_type>(v.dim()));
            m_outputBuffer.reset(new CuVector<field_type>(v.dim()));
        }
-        //TODO: to implement CuJac I added the moving of data from host to the output buffer
-        // This has performance impact for other preconditiners, so figure out how to avoid
-        m_outputBuffer->copyFromHost(v);
-
        m_inputBuffer->copyFromHost(d);
        m_underlyingPreconditioner->apply(*m_outputBuffer, *m_inputBuffer);
        m_outputBuffer->copyToHost(v);