From c4ebed1718df8ed6a5c076428940fa06b7741963 Mon Sep 17 00:00:00 2001 From: Tong Dong Qiu Date: Wed, 22 Jun 2022 14:29:03 +0200 Subject: [PATCH 1/3] Backport to fix ChowPatelIlu --- opm/simulators/linalg/bda/opencl/BILU0.cpp | 8 ++++---- opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl | 1 + opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl | 1 + 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/opm/simulators/linalg/bda/opencl/BILU0.cpp b/opm/simulators/linalg/bda/opencl/BILU0.cpp index b63f5959b..be1066652 100644 --- a/opm/simulators/linalg/bda/opencl/BILU0.cpp +++ b/opm/simulators/linalg/bda/opencl/BILU0.cpp @@ -136,8 +136,8 @@ bool BILU0::analyze_matrix(BlockedMatrix *mat, BlockedMatrix *jacMat invDiagVals.resize(mat->Nb * bs * bs); #if CHOW_PATEL - Lmat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); - Umat = std::make_unique >(mat->Nb, (mat->nnzbs - mat->Nb) / 2); + Lmat = std::make_unique(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size); + Umat = std::make_unique(mat->Nb, (mat->nnzbs - mat->Nb) / 2, block_size); #endif s.invDiagVals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * mat->Nb); @@ -223,9 +223,9 @@ bool BILU0::create_preconditioner(BlockedMatrix *mat, BlockedMatrix } #if CHOW_PATEL - chowPatelIlu.decomposition(queue, context, + chowPatelIlu.decomposition(queue.get(), context.get(), LUmat.get(), Lmat.get(), Umat.get(), - invDiagVals, diagIndex, + invDiagVals.data(), diagIndex, s.diagIndex, s.invDiagVals, s.Lvals, s.Lcols, s.Lrows, s.Uvals, s.Ucols, s.Urows); diff --git a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl index fb067995d..6efa8e30b 100644 --- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl +++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1.cl @@ -1,6 +1,7 @@ /// ILU apply part 1: forward substitution. /// Solves L*x=y where L is a lower triangular sparse blocked matrix. /// Here, L is it's own BSR matrix. +/// Only used with ChowPatelIlu. __kernel void ILU_apply1( __global const double *LUvals, __global const unsigned int *LUcols, diff --git a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl index 315488f58..6300740c5 100644 --- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl +++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2.cl @@ -1,6 +1,7 @@ /// ILU apply part 2: backward substitution. /// Solves U*x=y where U is an upper triangular sparse blocked matrix. /// Here, U is it's own BSR matrix. +/// Only used with ChowPatelIlu. __kernel void ILU_apply2( __global const double *LUvals, __global const int *LUcols, From 842a9a93177828c3bee98bdf85057c4e7bbfd6cc Mon Sep 17 00:00:00 2001 From: Tong Dong Qiu Date: Thu, 23 Jun 2022 10:17:59 +0200 Subject: [PATCH 2/3] Reduce source code line lengths --- .../linalg/bda/opencl/openclKernels.cpp | 61 ++++++++++++++----- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/opm/simulators/linalg/bda/opencl/openclKernels.cpp b/opm/simulators/linalg/bda/opencl/openclKernels.cpp index 67cf97827..08d670b8e 100644 --- a/opm/simulators/linalg/bda/opencl/openclKernels.cpp +++ b/opm/simulators/linalg/bda/opencl/openclKernels.cpp @@ -251,7 +251,8 @@ void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, c } } -void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N) +void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, + const double omega, const double beta, int N) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(N, work_group_size); @@ -319,7 +320,9 @@ void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, con } } -void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, const cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size, bool reset, bool add) +void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, + const cl::Buffer& x, cl::Buffer& b, int Nb, + unsigned int block_size, bool reset, bool add) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); @@ -330,15 +333,19 @@ void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, c if (block_size > 1) { if (add) { - event = (*spmv_blocked_add_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); + event = (*spmv_blocked_add_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); } else { - event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); + event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); } } else { if (reset) { - event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group)); + event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group)); } else { - event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group)); + event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group)); } } @@ -350,7 +357,9 @@ void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, c } } -void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, const cl::Buffer& rhs, cl::Buffer& out, int Nb, unsigned int block_size) +void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, + cl::Buffer& x, const cl::Buffer& rhs, + cl::Buffer& out, int Nb, unsigned int block_size) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); @@ -360,9 +369,11 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row cl::Event event; if (block_size > 1) { - event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group)); + event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group)); } else { - event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group)); + event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group)); } if (verbosity >= 4) { @@ -373,7 +384,11 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row } } -void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size) +void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, + cl::Buffer& rows, cl::Buffer& diagIndex, + const cl::Buffer& y, cl::Buffer& x, + cl::Buffer& rowsPerColor, int color, + int Nb, unsigned int block_size) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); @@ -381,7 +396,10 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& r const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; Timer t_ilu_apply1; - cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, diagIndex, y, x, rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); + cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + vals, cols, rows, diagIndex, y, x, + rowsPerColor, color, block_size, + cl::Local(lmem_per_work_group)); if (verbosity >= 5) { event.wait(); @@ -391,7 +409,11 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& r } } -void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size) +void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, + cl::Buffer& rows, cl::Buffer& diagIndex, + cl::Buffer& invDiagVals, cl::Buffer& x, + cl::Buffer& rowsPerColor, int color, + int Nb, unsigned int block_size) { const unsigned int work_group_size = 32; const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); @@ -399,7 +421,10 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& r const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; Timer t_ilu_apply2; - cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, diagIndex, invDiagVals, x, rowsPerColor, color, block_size, cl::Local(lmem_per_work_group)); + cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), + vals, cols, rows, diagIndex, invDiagVals, + x, rowsPerColor, color, block_size, + cl::Local(lmem_per_work_group)); if (verbosity >= 5) { event.wait(); @@ -409,7 +434,10 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& r } } -void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size) +void OpenclKernels::ILU_decomp(int firstRow, int lastRow, + cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, + cl::Buffer& diagIndex, cl::Buffer& invDiagVals, + int Nb, unsigned int block_size) { const unsigned int work_group_size2 = 128; const unsigned int num_work_groups2 = 1024; @@ -418,7 +446,10 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl:: const unsigned int lmem_per_work_group2 = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot Timer t_ilu_decomp; - cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, vals, cols, rows, invDiagVals, diagIndex, Nb, cl::Local(lmem_per_work_group2)); + cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), + firstRow, lastRow, vals, cols, rows, + invDiagVals, diagIndex, Nb, + cl::Local(lmem_per_work_group2)); if (verbosity >= 4) { event.wait(); From 85776f528a01b6744d710c013ac1eee7313c5953 Mon Sep 17 00:00:00 2001 From: Tong Dong Qiu Date: Thu, 23 Jun 2022 10:22:48 +0200 Subject: [PATCH 3/3] Tweak ILU opencl kernel launch params --- opm/simulators/linalg/bda/opencl/BILU0.cpp | 10 +++--- .../bda/opencl/kernels/ILU_apply1_fm.cl | 2 +- .../bda/opencl/kernels/ILU_apply2_fm.cl | 2 +- .../linalg/bda/opencl/openclKernels.cpp | 36 +++++++++++-------- .../linalg/bda/opencl/openclKernels.hpp | 1 + 5 files changed, 29 insertions(+), 22 deletions(-) diff --git a/opm/simulators/linalg/bda/opencl/BILU0.cpp b/opm/simulators/linalg/bda/opencl/BILU0.cpp index be1066652..e01d8e925 100644 --- a/opm/simulators/linalg/bda/opencl/BILU0.cpp +++ b/opm/simulators/linalg/bda/opencl/BILU0.cpp @@ -274,7 +274,7 @@ bool BILU0::create_preconditioner(BlockedMatrix *mat, BlockedMatrix if (verbosity >= 5) { out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n"; } - OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size); + OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, rowsPerColor[color], block_size); } if (verbosity >= 3) { @@ -299,17 +299,17 @@ void BILU0::apply(const cl::Buffer& y, cl::Buffer& x) for (int color = 0; color < numColors; ++color) { #if CHOW_PATEL - OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size); + OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, rowsPerColor[color], block_size); #else - OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size); + OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, rowsPerColor[color], block_size); #endif } for (int color = numColors - 1; color >= 0; --color) { #if CHOW_PATEL - OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size); + OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, rowsPerColor[color], block_size); #else - OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size); + OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, rowsPerColor[color], block_size); #endif } diff --git a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl index 41d46e381..483d4e7f2 100644 --- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl +++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply1_fm.cl @@ -14,7 +14,7 @@ __kernel void ILU_apply1( const unsigned int block_size, __local double *tmp) { - const unsigned int warpsize = 32; + const unsigned int warpsize = get_local_size(0); const unsigned int bs = block_size; const unsigned int idx_t = get_local_id(0); const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; diff --git a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl index f888b826e..d910552b6 100644 --- a/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl +++ b/opm/simulators/linalg/bda/opencl/kernels/ILU_apply2_fm.cl @@ -14,7 +14,7 @@ __kernel void ILU_apply2( const unsigned int block_size, __local double *tmp) { - const unsigned int warpsize = 32; + const unsigned int warpsize = get_local_size(0); const unsigned int bs = block_size; const unsigned int idx_t = get_local_id(0); const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs; diff --git a/opm/simulators/linalg/bda/opencl/openclKernels.cpp b/opm/simulators/linalg/bda/opencl/openclKernels.cpp index 08d670b8e..d3f7f8861 100644 --- a/opm/simulators/linalg/bda/opencl/openclKernels.cpp +++ b/opm/simulators/linalg/bda/opencl/openclKernels.cpp @@ -41,6 +41,7 @@ int OpenclKernels::verbosity; cl::CommandQueue *OpenclKernels::queue; std::vector OpenclKernels::tmp; bool OpenclKernels::initialized = false; +size_t OpenclKernels::preferred_workgroup_size = 0; std::unique_ptr > OpenclKernels::dot_k; std::unique_ptr > OpenclKernels::norm_k; @@ -140,6 +141,11 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve isaiL_k.reset(new isaiL_kernel_type(cl::Kernel(program, "isaiL"))); isaiU_k.reset(new isaiU_kernel_type(cl::Kernel(program, "isaiU"))); + // testing shows all kernels have the same preferred_workgroup_size_multiple + // 32 for NVIDIA + // 64 for AMD + cl::Kernel(program, "ILU_apply1").getWorkGroupInfo(devices[0], CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &preferred_workgroup_size_multiple); + initialized = true; } // end get_opencl_kernels() @@ -388,10 +394,10 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, - int Nb, unsigned int block_size) + int rowsThisColor, unsigned int block_size) { - const unsigned int work_group_size = 32; - const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); + const unsigned int work_group_size = preferred_workgroup_size_multiple; + const unsigned int num_work_groups = rowsThisColor; const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; Timer t_ilu_apply1; @@ -413,10 +419,10 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, - int Nb, unsigned int block_size) + int rowsThisColor, unsigned int block_size) { - const unsigned int work_group_size = 32; - const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); + const unsigned int work_group_size = preferred_workgroup_size_multiple; + const unsigned int num_work_groups = rowsThisColor; const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; Timer t_ilu_apply2; @@ -437,19 +443,19 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, - int Nb, unsigned int block_size) + int rowsThisColor, unsigned int block_size) { - const unsigned int work_group_size2 = 128; - const unsigned int num_work_groups2 = 1024; - const unsigned int total_work_items2 = num_work_groups2 * work_group_size2; - const unsigned int num_hwarps_per_group = work_group_size2 / 16; - const unsigned int lmem_per_work_group2 = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot + const unsigned int work_group_size = 128; + const unsigned int num_work_groups = rowsThisColor; + const unsigned int total_work_items = num_work_groups * work_group_size; + const unsigned int num_hwarps_per_group = work_group_size / 16; + const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot Timer t_ilu_decomp; - cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), + cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), firstRow, lastRow, vals, cols, rows, - invDiagVals, diagIndex, Nb, - cl::Local(lmem_per_work_group2)); + invDiagVals, diagIndex, rowsThisColor, + cl::Local(lmem_per_work_group)); if (verbosity >= 4) { event.wait(); diff --git a/opm/simulators/linalg/bda/opencl/openclKernels.hpp b/opm/simulators/linalg/bda/opencl/openclKernels.hpp index c56b748fb..780f295d7 100644 --- a/opm/simulators/linalg/bda/opencl/openclKernels.hpp +++ b/opm/simulators/linalg/bda/opencl/openclKernels.hpp @@ -64,6 +64,7 @@ private: static cl::CommandQueue *queue; static std::vector tmp; // used as tmp CPU buffer for dot() and norm() static bool initialized; + static size_t preferred_workgroup_size; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE static std::unique_ptr > dot_k; static std::unique_ptr > norm_k;