diff --git a/opencl-source-provider.cmake b/opencl-source-provider.cmake index 07567ead7..7581385b7 100644 --- a/opencl-source-provider.cmake +++ b/opencl-source-provider.cmake @@ -31,7 +31,7 @@ endif() foreach(CL ${CL_LIST}) get_filename_component(FNAME ${CL} NAME_WE) - file(APPEND ${CL_SRC_FILE} "const std::string OpenclKernels::${FNAME}_str = R\"\( \n") + file(APPEND ${CL_SRC_FILE} "template<> const std::string OpenclKernels::${FNAME}_str = R\"\( \n") file(READ "${CL}" CL_CONTENT) file(APPEND ${CL_SRC_FILE} "${CL_CONTENT}") file(APPEND ${CL_SRC_FILE} "\)\"; \n\n") diff --git a/opm/simulators/linalg/bda/opencl/BILU0.cpp b/opm/simulators/linalg/bda/opencl/BILU0.cpp index aa7417e35..81f3be4ec 100644 --- a/opm/simulators/linalg/bda/opencl/BILU0.cpp +++ b/opm/simulators/linalg/bda/opencl/BILU0.cpp @@ -244,9 +244,9 @@ create_preconditioner(BlockedMatrix* mat, if (verbosity >= 5) { out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n"; } - OpenclKernels::ILU_decomp(firstRow, lastRow, s.rowIndices, - s.LUvals, s.LUcols, s.LUrows, s.diagIndex, - s.invDiagVals, rowsPerColor[color], block_size); + OpenclKernels::ILU_decomp(firstRow, lastRow, s.rowIndices, + s.LUvals, s.LUcols, s.LUrows, s.diagIndex, + s.invDiagVals, rowsPerColor[color], block_size); } if (verbosity >= 3) { @@ -272,30 +272,30 @@ void BILU0::apply(const cl::Buffer& y, cl::Buffer& x) for (int color = 0; color < numColors; ++color) { #if CHOW_PATEL - OpenclKernels::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows, - s.diagIndex, y, x, s.rowsPerColor, - color, rowsPerColor[color], block_size); + OpenclKernels::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows, + s.diagIndex, y, x, s.rowsPerColor, + color, rowsPerColor[color], block_size); #else - OpenclKernels::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows, - s.diagIndex, y, x, s.rowsPerColor, - color, rowsPerColor[color], block_size); + OpenclKernels::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows, + s.diagIndex, y, x, s.rowsPerColor, + color, rowsPerColor[color], block_size); #endif } for (int color = numColors - 1; color >= 0; --color) { #if CHOW_PATEL - OpenclKernels::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows, - s.diagIndex, s.invDiagVals, x, s.rowsPerColor, - color, rowsPerColor[color], block_size); + OpenclKernels::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows, + s.diagIndex, s.invDiagVals, x, s.rowsPerColor, + color, rowsPerColor[color], block_size); #else - OpenclKernels::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows, - s.diagIndex, s.invDiagVals, x, s.rowsPerColor, - color, rowsPerColor[color], block_size); + OpenclKernels::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows, + s.diagIndex, s.invDiagVals, x, s.rowsPerColor, + color, rowsPerColor[color], block_size); #endif } // apply relaxation - OpenclKernels::scale(x, relaxation, N); + OpenclKernels::scale(x, relaxation, N); if (verbosity >= 4) { std::ostringstream out; diff --git a/opm/simulators/linalg/bda/opencl/BISAI.cpp b/opm/simulators/linalg/bda/opencl/BISAI.cpp index fd62e9320..ffb6497f6 100644 --- a/opm/simulators/linalg/bda/opencl/BISAI.cpp +++ b/opm/simulators/linalg/bda/opencl/BISAI.cpp @@ -263,8 +263,14 @@ create_preconditioner(BlockedMatrix* mat, cl::WaitForEvents(events); events.clear(); - OpenclKernels::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap, d_lower.subsystemPointers, d_lower.nzIndices, d_lower.unknownRhsIndices, d_lower.knownRhsIndices, d_LUvals, d_invLvals, Nb); - OpenclKernels::isaiU(d_diagIndex, d_colPointers, d_rowIndices, d_csrToCscOffsetMap, d_upper.subsystemPointers, d_upper.nzIndices, d_upper.unknownRhsIndices, d_upper.knownRhsIndices, d_LUvals, + OpenclKernels::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap, + d_lower.subsystemPointers, d_lower.nzIndices, + d_lower.unknownRhsIndices, d_lower.knownRhsIndices, + d_LUvals, d_invLvals, Nb); + OpenclKernels::isaiU(d_diagIndex, d_colPointers, d_rowIndices, + d_csrToCscOffsetMap, d_upper.subsystemPointers, + d_upper.nzIndices, d_upper.unknownRhsIndices, + d_upper.knownRhsIndices, d_LUvals, d_invDiagVals, d_invUvals, Nb); if(verbosity >= 4){ @@ -286,10 +292,12 @@ template void BISAI::apply(const cl::Buffer& x, cl::Buffer& y){ const unsigned int bs = block_size; - OpenclKernels::spmv(d_invLvals, d_rowIndices, d_colPointers, x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition - // (to compensate for the unitary diagonal that is not - // included in isaiL, for simplicity) - OpenclKernels::spmv(d_invUvals, d_rowIndices, d_colPointers, d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv + OpenclKernels::spmv(d_invLvals, d_rowIndices, d_colPointers, + x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition + // (to compensate for the unitary diagonal that is not + // included in isaiL, for simplicity) + OpenclKernels::spmv(d_invUvals, d_rowIndices, d_colPointers, + d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv } #define INSTANTIATE_BDA_FUNCTIONS(n) \ diff --git a/opm/simulators/linalg/bda/opencl/CPR.cpp b/opm/simulators/linalg/bda/opencl/CPR.cpp index 53eff8be1..6ef60e0c2 100644 --- a/opm/simulators/linalg/bda/opencl/CPR.cpp +++ b/opm/simulators/linalg/bda/opencl/CPR.cpp @@ -494,20 +494,21 @@ void CPR::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer& // presmooth double jacobi_damping = 0.65; // default value in amgcl: 0.72 for (unsigned i = 0; i < num_pre_smooth_steps; ++i){ - OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1); - OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur); + OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1); + OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur); } // move to coarser level - OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1); - OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true); + OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1); + OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true); amg_cycle_gpu(level + 1, f, u); - OpenclKernels::prolongate_vector(u, x, d_PcolIndices[level], Ncur); + OpenclKernels::prolongate_vector(u, x, d_PcolIndices[level], Ncur); // postsmooth for (unsigned i = 0; i < num_post_smooth_steps; ++i){ - OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1); - OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur); + OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, + x, y, t, Ncur, 1); + OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur); } } @@ -528,12 +529,13 @@ void CPR::apply_amg(const cl::Buffer& y, cl::Buffer& x) { OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error"); } - OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, x, y, *d_rs, Nb, block_size); - OpenclKernels::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb); + OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, + d_mat->rowPointers, x, y, *d_rs, Nb, block_size); + OpenclKernels::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb); amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x); - OpenclKernels::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb); + OpenclKernels::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb); } template diff --git a/opm/simulators/linalg/bda/opencl/openclKernels.cpp b/opm/simulators/linalg/bda/opencl/openclKernels.cpp index 77b0d7b4c..9c540f9ca 100644 --- a/opm/simulators/linalg/bda/opencl/openclKernels.cpp +++ b/opm/simulators/linalg/bda/opencl/openclKernels.cpp @@ -18,52 +18,71 @@ */ #include -#include -#include +#include #include #include #include -#include #include // defines CHOW_PATEL -namespace Opm -{ -namespace Accelerator -{ +#include +#include + +namespace Opm::Accelerator { using Opm::OpmLog; using Dune::Timer; // define static variables and kernels -int OpenclKernels::verbosity; -cl::CommandQueue *OpenclKernels::queue; -std::vector OpenclKernels::tmp; -bool OpenclKernels::initialized = false; -std::size_t OpenclKernels::preferred_workgroup_size_multiple = 0; +template int OpenclKernels::verbosity; +template cl::CommandQueue* OpenclKernels::queue; +template std::vector OpenclKernels::tmp; +template bool OpenclKernels::initialized = false; +template std::size_t OpenclKernels::preferred_workgroup_size_multiple = 0; -std::unique_ptr > OpenclKernels::dot_k; -std::unique_ptr > OpenclKernels::norm_k; -std::unique_ptr > OpenclKernels::axpy_k; -std::unique_ptr > OpenclKernels::scale_k; -std::unique_ptr > OpenclKernels::vmul_k; -std::unique_ptr > OpenclKernels::custom_k; -std::unique_ptr > OpenclKernels::full_to_pressure_restriction_k; -std::unique_ptr > OpenclKernels::add_coarse_pressure_correction_k; -std::unique_ptr > OpenclKernels::prolongate_vector_k; -std::unique_ptr OpenclKernels::spmv_blocked_k; -std::unique_ptr OpenclKernels::spmv_blocked_add_k; -std::unique_ptr OpenclKernels::spmv_k; -std::unique_ptr OpenclKernels::spmv_noreset_k; -std::unique_ptr OpenclKernels::residual_blocked_k; -std::unique_ptr OpenclKernels::residual_k; -std::unique_ptr OpenclKernels::ILU_apply1_k; -std::unique_ptr OpenclKernels::ILU_apply2_k; -std::unique_ptr OpenclKernels::stdwell_apply_k; -std::unique_ptr OpenclKernels::ilu_decomp_k; -std::unique_ptr OpenclKernels::isaiL_k; -std::unique_ptr OpenclKernels::isaiU_k; +template +std::unique_ptr > OpenclKernels::dot_k; +template +std::unique_ptr > OpenclKernels::norm_k; +template +std::unique_ptr > OpenclKernels::axpy_k; +template +std::unique_ptr > OpenclKernels::scale_k; +template +std::unique_ptr > OpenclKernels::vmul_k; +template +std::unique_ptr > OpenclKernels::custom_k; +template +std::unique_ptr > OpenclKernels::full_to_pressure_restriction_k; +template +std::unique_ptr > OpenclKernels::add_coarse_pressure_correction_k; +template +std::unique_ptr > OpenclKernels::prolongate_vector_k; +template +std::unique_ptr OpenclKernels::spmv_blocked_k; +template +std::unique_ptr OpenclKernels::spmv_blocked_add_k; +template +std::unique_ptr OpenclKernels::spmv_k; +template +std::unique_ptr OpenclKernels::spmv_noreset_k; +template +std::unique_ptr OpenclKernels::residual_blocked_k; +template +std::unique_ptr OpenclKernels::residual_k; +template +std::unique_ptr OpenclKernels::ILU_apply1_k; +template +std::unique_ptr OpenclKernels::ILU_apply2_k; +template +std::unique_ptr OpenclKernels::stdwell_apply_k; +template +std::unique_ptr OpenclKernels::ilu_decomp_k; +template +std::unique_ptr OpenclKernels::isaiL_k; +template +std::unique_ptr OpenclKernels::isaiU_k; // divide A by B, and round up: return (int)ceil(A/B) unsigned int ceilDivision(const unsigned int A, const unsigned int B) @@ -71,7 +90,10 @@ unsigned int ceilDivision(const unsigned int A, const unsigned int B) return A / B + (A % B > 0); } -void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector& devices, int verbosity_) +template +void OpenclKernels::init(cl::Context *context, + cl::CommandQueue *queue_, + std::vector& devices, int verbosity_) { if (initialized) { OpmLog::debug("Warning OpenclKernels is already initialized"); @@ -118,10 +140,10 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve // actually creating the kernels dot_k.reset(new cl::KernelFunctor(cl::Kernel(program, "dot_1"))); norm_k.reset(new cl::KernelFunctor(cl::Kernel(program, "norm"))); - axpy_k.reset(new cl::KernelFunctor(cl::Kernel(program, "axpy"))); - scale_k.reset(new cl::KernelFunctor(cl::Kernel(program, "scale"))); - vmul_k.reset(new cl::KernelFunctor(cl::Kernel(program, "vmul"))); - custom_k.reset(new cl::KernelFunctor(cl::Kernel(program, "custom"))); + axpy_k.reset(new cl::KernelFunctor(cl::Kernel(program, "axpy"))); + scale_k.reset(new cl::KernelFunctor(cl::Kernel(program, "scale"))); + vmul_k.reset(new cl::KernelFunctor(cl::Kernel(program, "vmul"))); + custom_k.reset(new cl::KernelFunctor(cl::Kernel(program, "custom"))); full_to_pressure_restriction_k.reset(new cl::KernelFunctor(cl::Kernel(program, "full_to_pressure_restriction"))); add_coarse_pressure_correction_k.reset(new cl::KernelFunctor(cl::Kernel(program, "add_coarse_pressure_correction"))); prolongate_vector_k.reset(new cl::KernelFunctor(cl::Kernel(program, "prolongate_vector"))); @@ -146,20 +168,21 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve initialized = true; } // end get_opencl_kernels() -double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N) +template +Scalar OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N) { const unsigned int work_group_size = 256; const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int total_work_items = num_work_groups * work_group_size; - const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; + const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size; Timer t_dot; tmp.resize(num_work_groups); cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group)); - queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data()); + queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data()); - double gpu_sum = 0.0; + Scalar gpu_sum = 0.0; for (unsigned int i = 0; i < num_work_groups; ++i) { gpu_sum += tmp[i]; } @@ -174,20 +197,21 @@ double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int return gpu_sum; } -double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N) +template +Scalar OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N) { const unsigned int work_group_size = 256; const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int total_work_items = num_work_groups * work_group_size; - const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; + const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size; Timer t_norm; tmp.resize(num_work_groups); cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group)); - queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data()); + queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data()); - double gpu_norm = 0.0; + Scalar gpu_norm = 0.0; for (unsigned int i = 0; i < num_work_groups; ++i) { gpu_norm += tmp[i]; } @@ -203,7 +227,8 @@ double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N) return gpu_norm; } -void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N) +template +void OpenclKernels::axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(N, work_group_size); @@ -220,7 +245,8 @@ void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N) } } -void OpenclKernels::scale(cl::Buffer& in, const double a, int N) +template +void OpenclKernels::scale(cl::Buffer& in, const Scalar a, int N) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(N, work_group_size); @@ -237,7 +263,8 @@ void OpenclKernels::scale(cl::Buffer& in, const double a, int N) } } -void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N) +template +void OpenclKernels::vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(N, work_group_size); @@ -254,8 +281,9 @@ void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, c } } -void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, - const double omega, const double beta, int N) +template +void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, + const Scalar omega, const Scalar beta, int N) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(N, work_group_size); @@ -272,7 +300,8 @@ void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, } } -void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb) +template +void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); @@ -289,7 +318,8 @@ void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::B } } -void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb) +template +void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); @@ -306,7 +336,8 @@ void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buf } } -void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N) +template +void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(N, work_group_size); @@ -323,32 +354,33 @@ void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, con } } -void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, - const cl::Buffer& x, cl::Buffer& b, int Nb, - unsigned int block_size, bool reset, bool add) +template +void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, + const cl::Buffer& x, cl::Buffer& b, int Nb, + unsigned int block_size, bool reset, bool add) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); const unsigned int total_work_items = num_work_groups * work_group_size; - const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; + const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size; Timer t_spmv; cl::Event event; if (block_size > 1) { if (add) { event = (*spmv_blocked_add_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); + vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); } else { event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); + vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); } } else { if (reset) { event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group)); + vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group)); } else { event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group)); + vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group)); } } @@ -360,23 +392,24 @@ void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, } } -void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, - cl::Buffer& x, const cl::Buffer& rhs, - cl::Buffer& out, int Nb, unsigned int block_size) +template +void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, + cl::Buffer& x, const cl::Buffer& rhs, + cl::Buffer& out, int Nb, unsigned int block_size) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); const unsigned int total_work_items = num_work_groups * work_group_size; - const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; + const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size; Timer t_residual; cl::Event event; if (block_size > 1) { event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group)); + vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group)); } else { event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group)); + vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group)); } if (verbosity >= 4) { @@ -387,22 +420,23 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row } } -void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, - cl::Buffer& rows, cl::Buffer& diagIndex, - const cl::Buffer& y, cl::Buffer& x, - cl::Buffer& rowsPerColor, int color, - int rowsThisColor, unsigned int block_size) +template +void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, + cl::Buffer& rows, cl::Buffer& diagIndex, + const cl::Buffer& y, cl::Buffer& x, + cl::Buffer& rowsPerColor, int color, + int rowsThisColor, unsigned int block_size) { const unsigned int work_group_size = preferred_workgroup_size_multiple; const unsigned int num_work_groups = rowsThisColor; const unsigned int total_work_items = num_work_groups * work_group_size; - const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; + const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size; Timer t_ilu_apply1; cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - rowIndices, vals, cols, rows, diagIndex, - y, x, rowsPerColor, color, block_size, - cl::Local(lmem_per_work_group)); + rowIndices, vals, cols, rows, diagIndex, + y, x, rowsPerColor, color, block_size, + cl::Local(lmem_per_work_group)); if (verbosity >= 5) { event.wait(); @@ -412,22 +446,23 @@ void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf } } -void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, - cl::Buffer& rows, cl::Buffer& diagIndex, - cl::Buffer& invDiagVals, cl::Buffer& x, - cl::Buffer& rowsPerColor, int color, - int rowsThisColor, unsigned int block_size) +template +void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, + cl::Buffer& rows, cl::Buffer& diagIndex, + cl::Buffer& invDiagVals, cl::Buffer& x, + cl::Buffer& rowsPerColor, int color, + int rowsThisColor, unsigned int block_size) { const unsigned int work_group_size = preferred_workgroup_size_multiple; const unsigned int num_work_groups = rowsThisColor; const unsigned int total_work_items = num_work_groups * work_group_size; - const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; + const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size; Timer t_ilu_apply2; cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - rowIndices, vals, cols, rows, diagIndex, - invDiagVals, x, rowsPerColor, color, block_size, - cl::Local(lmem_per_work_group)); + rowIndices, vals, cols, rows, diagIndex, + invDiagVals, x, rowsPerColor, color, block_size, + cl::Local(lmem_per_work_group)); if (verbosity >= 5) { event.wait(); @@ -437,23 +472,24 @@ void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf } } -void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices, - cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, - cl::Buffer& diagIndex, cl::Buffer& invDiagVals, - int rowsThisColor, unsigned int block_size) +template +void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices, + cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, + cl::Buffer& diagIndex, cl::Buffer& invDiagVals, + int rowsThisColor, unsigned int block_size) { const unsigned int work_group_size = 128; const unsigned int num_work_groups = rowsThisColor; const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int num_hwarps_per_group = work_group_size / 16; - const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot + const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(Scalar); // each block needs a pivot Timer t_ilu_decomp; cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - firstRow, lastRow, rowIndices, - vals, cols, rows, - invDiagVals, diagIndex, rowsThisColor, - cl::Local(lmem_per_work_group)); + firstRow, lastRow, rowIndices, + vals, cols, rows, + invDiagVals, diagIndex, rowsThisColor, + cl::Local(lmem_per_work_group)); if (verbosity >= 4) { event.wait(); @@ -463,19 +499,20 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices } } -void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl, - cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y, - int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells) +template +void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl, + cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y, + int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells) { const unsigned int work_group_size = 32; const unsigned int total_work_items = num_std_wells * work_group_size; - const unsigned int lmem1 = sizeof(double) * work_group_size; - const unsigned int lmem2 = sizeof(double) * dim_wells; + const unsigned int lmem1 = sizeof(Scalar) * work_group_size; + const unsigned int lmem2 = sizeof(Scalar) * dim_wells; Timer t_apply_stdwells; cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl, - cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); + d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl, + cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); if (verbosity >= 4) { event.wait(); @@ -485,8 +522,9 @@ void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ } } -void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc, - cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb) +template +void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc, + cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb) { const unsigned int work_group_size = 256; const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); @@ -494,7 +532,7 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu Timer t_isaiL; cl::Event event = (*isaiL_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb); + diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb); if (verbosity >= 4) { event.wait(); @@ -504,9 +542,10 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu } } -void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping, - cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, - cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb) +template +void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping, + cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, + cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb) { const unsigned int work_group_size = 256; const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); @@ -514,7 +553,7 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu Timer t_isaiU; cl::Event event = (*isaiU_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), - diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb); + diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb); if (verbosity >= 4) { event.wait(); @@ -524,5 +563,6 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu } } -} // namespace Accelerator -} // namespace Opm +template class OpenclKernels; + +} // namespace Opm::Accelerator diff --git a/opm/simulators/linalg/bda/opencl/openclKernels.hpp b/opm/simulators/linalg/bda/opencl/openclKernels.hpp index de803cc47..18f898a07 100644 --- a/opm/simulators/linalg/bda/opencl/openclKernels.hpp +++ b/opm/simulators/linalg/bda/opencl/openclKernels.hpp @@ -26,10 +26,7 @@ #include -namespace Opm -{ -namespace Accelerator -{ +namespace Opm::Accelerator { using spmv_blocked_kernel_type = cl::KernelFunctor; @@ -54,21 +51,22 @@ using isaiL_kernel_type = cl::KernelFunctor; +template class OpenclKernels { private: static int verbosity; static cl::CommandQueue *queue; - static std::vector tmp; // used as tmp CPU buffer for dot() and norm() + static std::vector tmp; // used as tmp CPU buffer for dot() and norm() static bool initialized; static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE static std::unique_ptr > dot_k; static std::unique_ptr > norm_k; - static std::unique_ptr > axpy_k; - static std::unique_ptr > scale_k; - static std::unique_ptr > vmul_k; - static std::unique_ptr > custom_k; + static std::unique_ptr > axpy_k; + static std::unique_ptr > scale_k; + static std::unique_ptr > vmul_k; + static std::unique_ptr > custom_k; static std::unique_ptr > full_to_pressure_restriction_k; static std::unique_ptr > add_coarse_pressure_correction_k; static std::unique_ptr > prolongate_vector_k; @@ -117,12 +115,12 @@ public: static void init(cl::Context *context, cl::CommandQueue *queue, std::vector& devices, int verbosity); - static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N); - static double norm(cl::Buffer& in, cl::Buffer& out, int N); - static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N); - static void scale(cl::Buffer& in, const double a, int N); - static void vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N); - static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N); + static Scalar dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N); + static Scalar norm(cl::Buffer& in, cl::Buffer& out, int N); + static void axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N); + static void scale(cl::Buffer& in, const Scalar a, int N); + static void vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N); + static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const Scalar omega, const Scalar beta, int N); static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb); static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb); static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N); @@ -150,7 +148,40 @@ public: cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb); }; -} // namespace Accelerator -} // namespace Opm +#if CHOW_PATEL +#define DECLARE_ILU(T) \ + template<> const std::string OpenclKernels::ILU_apply1_str; \ + template<> const std::string OpenclKernels::ILU_apply2_str; +#else +#define DECLARE_ILU(T) \ + template<> const std::string OpenclKernels::ILU_apply1_fm_str; \ + template<> const std::string OpenclKernels::ILU_apply2_fm_str; +#endif + +#define DECLARE_INSTANCE(T) \ + DECLARE_ILU(T) \ + template<> const std::string OpenclKernels::axpy_str; \ + template<> const std::string OpenclKernels::scale_str; \ + template<> const std::string OpenclKernels::vmul_str; \ + template<> const std::string OpenclKernels::dot_1_str; \ + template<> const std::string OpenclKernels::norm_str; \ + template<> const std::string OpenclKernels::custom_str; \ + template<> const std::string OpenclKernels::full_to_pressure_restriction_str; \ + template<> const std::string OpenclKernels::add_coarse_pressure_correction_str; \ + template<> const std::string OpenclKernels::prolongate_vector_str; \ + template<> const std::string OpenclKernels::spmv_blocked_str; \ + template<> const std::string OpenclKernels::spmv_blocked_add_str; \ + template<> const std::string OpenclKernels::spmv_str; \ + template<> const std::string OpenclKernels::spmv_noreset_str; \ + template<> const std::string OpenclKernels::residual_blocked_str; \ + template<> const std::string OpenclKernels::residual_str; \ + template<> const std::string OpenclKernels::stdwell_apply_str; \ + template<> const std::string OpenclKernels::ILU_decomp_str; \ + template<> const std::string OpenclKernels::isaiL_str; \ + template<> const std::string OpenclKernels::isaiU_str; + +DECLARE_INSTANCE(double) + +} // namespace Opm::Accelerator #endif diff --git a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp index 34e458a03..1e34ee83f 100644 --- a/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp +++ b/opm/simulators/linalg/bda/opencl/openclSolverBackend.cpp @@ -203,7 +203,7 @@ openclSolverBackend::openclSolverBackend(int verbosity_, int maxit_, context = std::make_shared(devices[0]); queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err)); - OpenclKernels::init(context.get(), queue.get(), devices, verbosity); + OpenclKernels::init(context.get(), queue.get(), devices, verbosity); } catch (const cl::Error& error) { std::ostringstream oss; @@ -263,7 +263,7 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error"); } - norm = OpenclKernels::norm(d_r, d_tmp, N); + norm = OpenclKernels::norm(d_r, d_tmp, N); norm_0 = norm; if (verbosity > 1) { @@ -277,11 +277,11 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr } for (it = 0.5; it < maxit; it += 0.5) { rhop = rho; - rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N); + rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N); if (it > 1) { beta = (rho / rhop) * (alpha / omega); - OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N); + OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N); } if (verbosity >= 3) { queue->finish(); @@ -298,7 +298,7 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr } // v = A * pw - OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size); + OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size); if (verbosity >= 3) { queue->finish(); t_spmv.stop(); @@ -315,11 +315,11 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr t_rest.start(); } - tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N); + tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N); alpha = rho / tmp1; - OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v - OpenclKernels::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw - norm = OpenclKernels::norm(d_r, d_tmp, N); + OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v + OpenclKernels::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw + norm = OpenclKernels::norm(d_r, d_tmp, N); if (verbosity >= 3) { queue->finish(); t_rest.stop(); @@ -343,7 +343,7 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr } // t = A * s - OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size); + OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size); if(verbosity >= 3){ queue->finish(); t_spmv.stop(); @@ -360,12 +360,12 @@ void openclSolverBackend::gpu_pbicgstab(WellContributions& wellContr t_rest.start(); } - tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N); - tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N); + tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N); + tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N); omega = tmp1 / tmp2; - OpenclKernels::axpy(d_s, omega, d_x, N); // x = x + omega * s - OpenclKernels::axpy(d_t, -omega, d_r, N); // r = r - omega * t - norm = OpenclKernels::norm(d_r, d_tmp, N); + OpenclKernels::axpy(d_s, omega, d_x, N); // x = x + omega * s + OpenclKernels::axpy(d_t, -omega, d_r, N); // r = r - omega * t + norm = OpenclKernels::norm(d_r, d_tmp, N); if (verbosity >= 3) { queue->finish(); t_rest.stop(); diff --git a/opm/simulators/linalg/bda/opencl/openclWellContributions.cpp b/opm/simulators/linalg/bda/opencl/openclWellContributions.cpp index ae63c9bc6..9463daa0e 100644 --- a/opm/simulators/linalg/bda/opencl/openclWellContributions.cpp +++ b/opm/simulators/linalg/bda/opencl/openclWellContributions.cpp @@ -36,9 +36,12 @@ void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue* } -void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y){ - OpenclKernels::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, - d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells); +void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y) +{ + OpenclKernels::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, + *d_Ccols_ocl, *d_Bcols_ocl, + d_x, d_y, dim, dim_wells, + *d_val_pointers_ocl, num_std_wells); } void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){