OpenclKernels: template Scalar type

This commit is contained in:
Arne Morten Kvarving 2024-04-15 16:45:13 +02:00
parent be59203179
commit ba1c6db855
8 changed files with 263 additions and 179 deletions

View File

@ -31,7 +31,7 @@ endif()
foreach(CL ${CL_LIST}) foreach(CL ${CL_LIST})
get_filename_component(FNAME ${CL} NAME_WE) get_filename_component(FNAME ${CL} NAME_WE)
file(APPEND ${CL_SRC_FILE} "const std::string OpenclKernels::${FNAME}_str = R\"\( \n") file(APPEND ${CL_SRC_FILE} "template<> const std::string OpenclKernels<double>::${FNAME}_str = R\"\( \n")
file(READ "${CL}" CL_CONTENT) file(READ "${CL}" CL_CONTENT)
file(APPEND ${CL_SRC_FILE} "${CL_CONTENT}") file(APPEND ${CL_SRC_FILE} "${CL_CONTENT}")
file(APPEND ${CL_SRC_FILE} "\)\"; \n\n") file(APPEND ${CL_SRC_FILE} "\)\"; \n\n")

View File

@ -244,9 +244,9 @@ create_preconditioner(BlockedMatrix<double>* mat,
if (verbosity >= 5) { if (verbosity >= 5) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n"; out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
} }
OpenclKernels::ILU_decomp(firstRow, lastRow, s.rowIndices, OpenclKernels<double>::ILU_decomp(firstRow, lastRow, s.rowIndices,
s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
s.invDiagVals, rowsPerColor[color], block_size); s.invDiagVals, rowsPerColor[color], block_size);
} }
if (verbosity >= 3) { if (verbosity >= 3) {
@ -272,30 +272,30 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
for (int color = 0; color < numColors; ++color) { for (int color = 0; color < numColors; ++color) {
#if CHOW_PATEL #if CHOW_PATEL
OpenclKernels::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows, OpenclKernels<double>::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
s.diagIndex, y, x, s.rowsPerColor, s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size); color, rowsPerColor[color], block_size);
#else #else
OpenclKernels::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows, OpenclKernels<double>::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, y, x, s.rowsPerColor, s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size); color, rowsPerColor[color], block_size);
#endif #endif
} }
for (int color = numColors - 1; color >= 0; --color) { for (int color = numColors - 1; color >= 0; --color) {
#if CHOW_PATEL #if CHOW_PATEL
OpenclKernels::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows, OpenclKernels<double>::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor, s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size); color, rowsPerColor[color], block_size);
#else #else
OpenclKernels::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows, OpenclKernels<double>::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor, s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size); color, rowsPerColor[color], block_size);
#endif #endif
} }
// apply relaxation // apply relaxation
OpenclKernels::scale(x, relaxation, N); OpenclKernels<double>::scale(x, relaxation, N);
if (verbosity >= 4) { if (verbosity >= 4) {
std::ostringstream out; std::ostringstream out;

View File

@ -263,8 +263,14 @@ create_preconditioner(BlockedMatrix<double>* mat,
cl::WaitForEvents(events); cl::WaitForEvents(events);
events.clear(); events.clear();
OpenclKernels::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap, d_lower.subsystemPointers, d_lower.nzIndices, d_lower.unknownRhsIndices, d_lower.knownRhsIndices, d_LUvals, d_invLvals, Nb); OpenclKernels<double>::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap,
OpenclKernels::isaiU(d_diagIndex, d_colPointers, d_rowIndices, d_csrToCscOffsetMap, d_upper.subsystemPointers, d_upper.nzIndices, d_upper.unknownRhsIndices, d_upper.knownRhsIndices, d_LUvals, d_lower.subsystemPointers, d_lower.nzIndices,
d_lower.unknownRhsIndices, d_lower.knownRhsIndices,
d_LUvals, d_invLvals, Nb);
OpenclKernels<double>::isaiU(d_diagIndex, d_colPointers, d_rowIndices,
d_csrToCscOffsetMap, d_upper.subsystemPointers,
d_upper.nzIndices, d_upper.unknownRhsIndices,
d_upper.knownRhsIndices, d_LUvals,
d_invDiagVals, d_invUvals, Nb); d_invDiagVals, d_invUvals, Nb);
if(verbosity >= 4){ if(verbosity >= 4){
@ -286,10 +292,12 @@ template <unsigned int block_size>
void BISAI<block_size>::apply(const cl::Buffer& x, cl::Buffer& y){ void BISAI<block_size>::apply(const cl::Buffer& x, cl::Buffer& y){
const unsigned int bs = block_size; const unsigned int bs = block_size;
OpenclKernels::spmv(d_invLvals, d_rowIndices, d_colPointers, x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition OpenclKernels<double>::spmv(d_invLvals, d_rowIndices, d_colPointers,
// (to compensate for the unitary diagonal that is not x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition
// included in isaiL, for simplicity) // (to compensate for the unitary diagonal that is not
OpenclKernels::spmv(d_invUvals, d_rowIndices, d_colPointers, d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv // included in isaiL, for simplicity)
OpenclKernels<double>::spmv(d_invUvals, d_rowIndices, d_colPointers,
d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv
} }
#define INSTANTIATE_BDA_FUNCTIONS(n) \ #define INSTANTIATE_BDA_FUNCTIONS(n) \

View File

@ -494,20 +494,21 @@ void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer&
// presmooth // presmooth
double jacobi_damping = 0.65; // default value in amgcl: 0.72 double jacobi_damping = 0.65; // default value in amgcl: 0.72
for (unsigned i = 0; i < num_pre_smooth_steps; ++i){ for (unsigned i = 0; i < num_pre_smooth_steps; ++i){
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1); OpenclKernels<double>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur); OpenclKernels<double>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
} }
// move to coarser level // move to coarser level
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1); OpenclKernels<double>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true); OpenclKernels<double>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
amg_cycle_gpu(level + 1, f, u); amg_cycle_gpu(level + 1, f, u);
OpenclKernels::prolongate_vector(u, x, d_PcolIndices[level], Ncur); OpenclKernels<double>::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
// postsmooth // postsmooth
for (unsigned i = 0; i < num_post_smooth_steps; ++i){ for (unsigned i = 0; i < num_post_smooth_steps; ++i){
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1); OpenclKernels<double>::residual(A->nnzValues, A->colIndices, A->rowPointers,
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur); x, y, t, Ncur, 1);
OpenclKernels<double>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
} }
} }
@ -528,12 +529,13 @@ void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error"); OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
} }
OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, x, y, *d_rs, Nb, block_size); OpenclKernels<double>::residual(d_mat->nnzValues, d_mat->colIndices,
OpenclKernels::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb); d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
OpenclKernels<double>::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x); amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
OpenclKernels::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb); OpenclKernels<double>::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
} }
template <unsigned int block_size> template <unsigned int block_size>

View File

@ -18,52 +18,71 @@
*/ */
#include <config.h> #include <config.h>
#include <cmath> #include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
#include <sstream>
#include <opm/common/OpmLog/OpmLog.hpp> #include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp> #include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh> #include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp> // defines CHOW_PATEL #include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp> // defines CHOW_PATEL
namespace Opm #include <cmath>
{ #include <sstream>
namespace Accelerator
{ namespace Opm::Accelerator {
using Opm::OpmLog; using Opm::OpmLog;
using Dune::Timer; using Dune::Timer;
// define static variables and kernels // define static variables and kernels
int OpenclKernels::verbosity; template<class Scalar> int OpenclKernels<Scalar>::verbosity;
cl::CommandQueue *OpenclKernels::queue; template<class Scalar> cl::CommandQueue* OpenclKernels<Scalar>::queue;
std::vector<double> OpenclKernels::tmp; template<class Scalar> std::vector<Scalar> OpenclKernels<Scalar>::tmp;
bool OpenclKernels::initialized = false; template<class Scalar> bool OpenclKernels<Scalar>::initialized = false;
std::size_t OpenclKernels::preferred_workgroup_size_multiple = 0; template<class Scalar> std::size_t OpenclKernels<Scalar>::preferred_workgroup_size_multiple = 0;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k; template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k; std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels<Scalar>::dot_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k; template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k; std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels<Scalar>::norm_k;
std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::vmul_k; template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k; std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::axpy_k;
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::full_to_pressure_restriction_k; template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels::add_coarse_pressure_correction_k; std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int> > OpenclKernels<Scalar>::scale_k;
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels::prolongate_vector_k; template<class Scalar>
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_k; std::unique_ptr<cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::vmul_k;
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_add_k; template<class Scalar>
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_k; std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int> > OpenclKernels<Scalar>::custom_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_noreset_k; template<class Scalar>
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels::residual_blocked_k; std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::full_to_pressure_restriction_k;
std::unique_ptr<residual_kernel_type> OpenclKernels::residual_k; template<class Scalar>
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k; std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels<Scalar>::add_coarse_pressure_correction_k;
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k; template<class Scalar>
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k; std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::prolongate_vector_k;
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k; template<class Scalar>
std::unique_ptr<isaiL_kernel_type> OpenclKernels::isaiL_k; std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels<Scalar>::spmv_blocked_k;
std::unique_ptr<isaiU_kernel_type> OpenclKernels::isaiU_k; template<class Scalar>
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels<Scalar>::spmv_blocked_add_k;
template<class Scalar>
std::unique_ptr<spmv_kernel_type> OpenclKernels<Scalar>::spmv_k;
template<class Scalar>
std::unique_ptr<spmv_kernel_type> OpenclKernels<Scalar>::spmv_noreset_k;
template<class Scalar>
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels<Scalar>::residual_blocked_k;
template<class Scalar>
std::unique_ptr<residual_kernel_type> OpenclKernels<Scalar>::residual_k;
template<class Scalar>
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels<Scalar>::ILU_apply1_k;
template<class Scalar>
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels<Scalar>::ILU_apply2_k;
template<class Scalar>
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels<Scalar>::stdwell_apply_k;
template<class Scalar>
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels<Scalar>::ilu_decomp_k;
template<class Scalar>
std::unique_ptr<isaiL_kernel_type> OpenclKernels<Scalar>::isaiL_k;
template<class Scalar>
std::unique_ptr<isaiU_kernel_type> OpenclKernels<Scalar>::isaiU_k;
// divide A by B, and round up: return (int)ceil(A/B) // divide A by B, and round up: return (int)ceil(A/B)
unsigned int ceilDivision(const unsigned int A, const unsigned int B) unsigned int ceilDivision(const unsigned int A, const unsigned int B)
@ -71,7 +90,10 @@ unsigned int ceilDivision(const unsigned int A, const unsigned int B)
return A / B + (A % B > 0); return A / B + (A % B > 0);
} }
void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_) template<class Scalar>
void OpenclKernels<Scalar>::init(cl::Context *context,
cl::CommandQueue *queue_,
std::vector<cl::Device>& devices, int verbosity_)
{ {
if (initialized) { if (initialized) {
OpmLog::debug("Warning OpenclKernels is already initialized"); OpmLog::debug("Warning OpenclKernels is already initialized");
@ -118,10 +140,10 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
// actually creating the kernels // actually creating the kernels
dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1"))); dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm"))); norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy"))); axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale"))); scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int>(cl::Kernel(program, "scale")));
vmul_k.reset(new cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul"))); vmul_k.reset(new cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom"))); custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int>(cl::Kernel(program, "custom")));
full_to_pressure_restriction_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "full_to_pressure_restriction"))); full_to_pressure_restriction_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "full_to_pressure_restriction")));
add_coarse_pressure_correction_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "add_coarse_pressure_correction"))); add_coarse_pressure_correction_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "add_coarse_pressure_correction")));
prolongate_vector_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int>(cl::Kernel(program, "prolongate_vector"))); prolongate_vector_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int>(cl::Kernel(program, "prolongate_vector")));
@ -146,20 +168,21 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
initialized = true; initialized = true;
} // end get_opencl_kernels() } // end get_opencl_kernels()
double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N) template<class Scalar>
Scalar OpenclKernels<Scalar>::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
{ {
const unsigned int work_group_size = 256; const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_dot; Timer t_dot;
tmp.resize(num_work_groups); tmp.resize(num_work_groups);
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group)); cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data()); queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data());
double gpu_sum = 0.0; Scalar gpu_sum = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) { for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_sum += tmp[i]; gpu_sum += tmp[i];
} }
@ -174,20 +197,21 @@ double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int
return gpu_sum; return gpu_sum;
} }
double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N) template<class Scalar>
Scalar OpenclKernels<Scalar>::norm(cl::Buffer& in, cl::Buffer& out, int N)
{ {
const unsigned int work_group_size = 256; const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_norm; Timer t_norm;
tmp.resize(num_work_groups); tmp.resize(num_work_groups);
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group)); cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data()); queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data());
double gpu_norm = 0.0; Scalar gpu_norm = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) { for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_norm += tmp[i]; gpu_norm += tmp[i];
} }
@ -203,7 +227,8 @@ double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
return gpu_norm; return gpu_norm;
} }
void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N) template<class Scalar>
void OpenclKernels<Scalar>::axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N)
{ {
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -220,7 +245,8 @@ void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
} }
} }
void OpenclKernels::scale(cl::Buffer& in, const double a, int N) template<class Scalar>
void OpenclKernels<Scalar>::scale(cl::Buffer& in, const Scalar a, int N)
{ {
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -237,7 +263,8 @@ void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
} }
} }
void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N) template<class Scalar>
void OpenclKernels<Scalar>::vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
{ {
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -254,8 +281,9 @@ void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, c
} }
} }
void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, template<class Scalar>
const double omega, const double beta, int N) void OpenclKernels<Scalar>::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
const Scalar omega, const Scalar beta, int N)
{ {
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -272,7 +300,8 @@ void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
} }
} }
void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb) template<class Scalar>
void OpenclKernels<Scalar>::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
{ {
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -289,7 +318,8 @@ void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::B
} }
} }
void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb) template<class Scalar>
void OpenclKernels<Scalar>::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
{ {
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -306,7 +336,8 @@ void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buf
} }
} }
void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N) template<class Scalar>
void OpenclKernels<Scalar>::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
{ {
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size); const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -323,32 +354,33 @@ void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, con
} }
} }
void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, template<class Scalar>
const cl::Buffer& x, cl::Buffer& b, int Nb, void OpenclKernels<Scalar>::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
unsigned int block_size, bool reset, bool add) const cl::Buffer& x, cl::Buffer& b, int Nb,
unsigned int block_size, bool reset, bool add)
{ {
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_spmv; Timer t_spmv;
cl::Event event; cl::Event event;
if (block_size > 1) { if (block_size > 1) {
if (add) { if (add) {
event = (*spmv_blocked_add_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), event = (*spmv_blocked_add_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
} else { } else {
event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group)); vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
} }
} else { } else {
if (reset) { if (reset) {
event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group)); vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
} else { } else {
event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group)); vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
} }
} }
@ -360,23 +392,24 @@ void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
} }
} }
void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, template<class Scalar>
cl::Buffer& x, const cl::Buffer& rhs, void OpenclKernels<Scalar>::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& out, int Nb, unsigned int block_size) cl::Buffer& x, const cl::Buffer& rhs,
cl::Buffer& out, int Nb, unsigned int block_size)
{ {
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_residual; Timer t_residual;
cl::Event event; cl::Event event;
if (block_size > 1) { if (block_size > 1) {
event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group)); vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
} else { } else {
event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group)); vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
} }
if (verbosity >= 4) { if (verbosity >= 4) {
@ -387,22 +420,23 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row
} }
} }
void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, template<class Scalar>
cl::Buffer& rows, cl::Buffer& diagIndex, void OpenclKernels<Scalar>::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& rowsPerColor, int color, const cl::Buffer& y, cl::Buffer& x,
int rowsThisColor, unsigned int block_size) cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
{ {
const unsigned int work_group_size = preferred_workgroup_size_multiple; const unsigned int work_group_size = preferred_workgroup_size_multiple;
const unsigned int num_work_groups = rowsThisColor; const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_ilu_apply1; Timer t_ilu_apply1;
cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
rowIndices, vals, cols, rows, diagIndex, rowIndices, vals, cols, rows, diagIndex,
y, x, rowsPerColor, color, block_size, y, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group)); cl::Local(lmem_per_work_group));
if (verbosity >= 5) { if (verbosity >= 5) {
event.wait(); event.wait();
@ -412,22 +446,23 @@ void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf
} }
} }
void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols, template<class Scalar>
cl::Buffer& rows, cl::Buffer& diagIndex, void OpenclKernels<Scalar>::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& rowsPerColor, int color, cl::Buffer& invDiagVals, cl::Buffer& x,
int rowsThisColor, unsigned int block_size) cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
{ {
const unsigned int work_group_size = preferred_workgroup_size_multiple; const unsigned int work_group_size = preferred_workgroup_size_multiple;
const unsigned int num_work_groups = rowsThisColor; const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size; const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_ilu_apply2; Timer t_ilu_apply2;
cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
rowIndices, vals, cols, rows, diagIndex, rowIndices, vals, cols, rows, diagIndex,
invDiagVals, x, rowsPerColor, color, block_size, invDiagVals, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group)); cl::Local(lmem_per_work_group));
if (verbosity >= 5) { if (verbosity >= 5) {
event.wait(); event.wait();
@ -437,23 +472,24 @@ void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf
} }
} }
void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices, template<class Scalar>
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, void OpenclKernels<Scalar>::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
int rowsThisColor, unsigned int block_size) cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
int rowsThisColor, unsigned int block_size)
{ {
const unsigned int work_group_size = 128; const unsigned int work_group_size = 128;
const unsigned int num_work_groups = rowsThisColor; const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size; const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int num_hwarps_per_group = work_group_size / 16; const unsigned int num_hwarps_per_group = work_group_size / 16;
const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(Scalar); // each block needs a pivot
Timer t_ilu_decomp; Timer t_ilu_decomp;
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
firstRow, lastRow, rowIndices, firstRow, lastRow, rowIndices,
vals, cols, rows, vals, cols, rows,
invDiagVals, diagIndex, rowsThisColor, invDiagVals, diagIndex, rowsThisColor,
cl::Local(lmem_per_work_group)); cl::Local(lmem_per_work_group));
if (verbosity >= 4) { if (verbosity >= 4) {
event.wait(); event.wait();
@ -463,19 +499,20 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices
} }
} }
void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl, template<class Scalar>
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y, void OpenclKernels<Scalar>::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells) cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
{ {
const unsigned int work_group_size = 32; const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size; const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size; const unsigned int lmem1 = sizeof(Scalar) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells; const unsigned int lmem2 = sizeof(Scalar) * dim_wells;
Timer t_apply_stdwells; Timer t_apply_stdwells;
cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl, d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2)); cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
if (verbosity >= 4) { if (verbosity >= 4) {
event.wait(); event.wait();
@ -485,8 +522,9 @@ void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_
} }
} }
void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc, template<class Scalar>
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb) void OpenclKernels<Scalar>::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb)
{ {
const unsigned int work_group_size = 256; const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -494,7 +532,7 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
Timer t_isaiL; Timer t_isaiL;
cl::Event event = (*isaiL_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), cl::Event event = (*isaiL_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb); diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb);
if (verbosity >= 4) { if (verbosity >= 4) {
event.wait(); event.wait();
@ -504,9 +542,10 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
} }
} }
void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping, template<class Scalar>
cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, void OpenclKernels<Scalar>::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb) cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb)
{ {
const unsigned int work_group_size = 256; const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size); const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -514,7 +553,7 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
Timer t_isaiU; Timer t_isaiU;
cl::Event event = (*isaiU_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), cl::Event event = (*isaiU_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb); diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb);
if (verbosity >= 4) { if (verbosity >= 4) {
event.wait(); event.wait();
@ -524,5 +563,6 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
} }
} }
} // namespace Accelerator template class OpenclKernels<double>;
} // namespace Opm
} // namespace Opm::Accelerator

View File

@ -26,10 +26,7 @@
#include <opm/simulators/linalg/bda/opencl/opencl.hpp> #include <opm/simulators/linalg/bda/opencl/opencl.hpp>
namespace Opm namespace Opm::Accelerator {
{
namespace Accelerator
{
using spmv_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, using spmv_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>; const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
@ -54,21 +51,22 @@ using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer
using isaiU_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, using isaiU_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>; cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;
template<class Scalar>
class OpenclKernels class OpenclKernels
{ {
private: private:
static int verbosity; static int verbosity;
static cl::CommandQueue *queue; static cl::CommandQueue *queue;
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm() static std::vector<Scalar> tmp; // used as tmp CPU buffer for dot() and norm()
static bool initialized; static bool initialized;
static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k; static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k; static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k; static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int> > axpy_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k; static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int> > scale_k;
static std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k; static std::unique_ptr<cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k; static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int> > custom_k;
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > full_to_pressure_restriction_k; static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > full_to_pressure_restriction_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > add_coarse_pressure_correction_k; static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > add_coarse_pressure_correction_k;
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > prolongate_vector_k; static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > prolongate_vector_k;
@ -117,12 +115,12 @@ public:
static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity); static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);
static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N); static Scalar dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static double norm(cl::Buffer& in, cl::Buffer& out, int N); static Scalar norm(cl::Buffer& in, cl::Buffer& out, int N);
static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N); static void axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N);
static void scale(cl::Buffer& in, const double a, int N); static void scale(cl::Buffer& in, const Scalar a, int N);
static void vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N); static void vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N); static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const Scalar omega, const Scalar beta, int N);
static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb); static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb);
static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb); static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb);
static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N); static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N);
@ -150,7 +148,40 @@ public:
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb); cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb);
}; };
} // namespace Accelerator #if CHOW_PATEL
} // namespace Opm #define DECLARE_ILU(T) \
template<> const std::string OpenclKernels<T>::ILU_apply1_str; \
template<> const std::string OpenclKernels<T>::ILU_apply2_str;
#else
#define DECLARE_ILU(T) \
template<> const std::string OpenclKernels<T>::ILU_apply1_fm_str; \
template<> const std::string OpenclKernels<T>::ILU_apply2_fm_str;
#endif
#define DECLARE_INSTANCE(T) \
DECLARE_ILU(T) \
template<> const std::string OpenclKernels<T>::axpy_str; \
template<> const std::string OpenclKernels<T>::scale_str; \
template<> const std::string OpenclKernels<T>::vmul_str; \
template<> const std::string OpenclKernels<T>::dot_1_str; \
template<> const std::string OpenclKernels<T>::norm_str; \
template<> const std::string OpenclKernels<T>::custom_str; \
template<> const std::string OpenclKernels<T>::full_to_pressure_restriction_str; \
template<> const std::string OpenclKernels<T>::add_coarse_pressure_correction_str; \
template<> const std::string OpenclKernels<T>::prolongate_vector_str; \
template<> const std::string OpenclKernels<T>::spmv_blocked_str; \
template<> const std::string OpenclKernels<T>::spmv_blocked_add_str; \
template<> const std::string OpenclKernels<T>::spmv_str; \
template<> const std::string OpenclKernels<T>::spmv_noreset_str; \
template<> const std::string OpenclKernels<T>::residual_blocked_str; \
template<> const std::string OpenclKernels<T>::residual_str; \
template<> const std::string OpenclKernels<T>::stdwell_apply_str; \
template<> const std::string OpenclKernels<T>::ILU_decomp_str; \
template<> const std::string OpenclKernels<T>::isaiL_str; \
template<> const std::string OpenclKernels<T>::isaiU_str;
DECLARE_INSTANCE(double)
} // namespace Opm::Accelerator
#endif #endif

View File

@ -203,7 +203,7 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
context = std::make_shared<cl::Context>(devices[0]); context = std::make_shared<cl::Context>(devices[0]);
queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err)); queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));
OpenclKernels::init(context.get(), queue.get(), devices, verbosity); OpenclKernels<double>::init(context.get(), queue.get(), devices, verbosity);
} catch (const cl::Error& error) { } catch (const cl::Error& error) {
std::ostringstream oss; std::ostringstream oss;
@ -263,7 +263,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error"); OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
} }
norm = OpenclKernels::norm(d_r, d_tmp, N); norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
norm_0 = norm; norm_0 = norm;
if (verbosity > 1) { if (verbosity > 1) {
@ -277,11 +277,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
} }
for (it = 0.5; it < maxit; it += 0.5) { for (it = 0.5; it < maxit; it += 0.5) {
rhop = rho; rhop = rho;
rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N); rho = OpenclKernels<double>::dot(d_rw, d_r, d_tmp, N);
if (it > 1) { if (it > 1) {
beta = (rho / rhop) * (alpha / omega); beta = (rho / rhop) * (alpha / omega);
OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N); OpenclKernels<double>::custom(d_p, d_v, d_r, omega, beta, N);
} }
if (verbosity >= 3) { if (verbosity >= 3) {
queue->finish(); queue->finish();
@ -298,7 +298,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
} }
// v = A * pw // v = A * pw
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size); OpenclKernels<double>::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
if (verbosity >= 3) { if (verbosity >= 3) {
queue->finish(); queue->finish();
t_spmv.stop(); t_spmv.stop();
@ -315,11 +315,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_rest.start(); t_rest.start();
} }
tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N); tmp1 = OpenclKernels<double>::dot(d_rw, d_v, d_tmp, N);
alpha = rho / tmp1; alpha = rho / tmp1;
OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v OpenclKernels<double>::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
OpenclKernels::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw OpenclKernels<double>::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
norm = OpenclKernels::norm(d_r, d_tmp, N); norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
if (verbosity >= 3) { if (verbosity >= 3) {
queue->finish(); queue->finish();
t_rest.stop(); t_rest.stop();
@ -343,7 +343,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
} }
// t = A * s // t = A * s
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size); OpenclKernels<double>::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
if(verbosity >= 3){ if(verbosity >= 3){
queue->finish(); queue->finish();
t_spmv.stop(); t_spmv.stop();
@ -360,12 +360,12 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_rest.start(); t_rest.start();
} }
tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N); tmp1 = OpenclKernels<double>::dot(d_t, d_r, d_tmp, N);
tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N); tmp2 = OpenclKernels<double>::dot(d_t, d_t, d_tmp, N);
omega = tmp1 / tmp2; omega = tmp1 / tmp2;
OpenclKernels::axpy(d_s, omega, d_x, N); // x = x + omega * s OpenclKernels<double>::axpy(d_s, omega, d_x, N); // x = x + omega * s
OpenclKernels::axpy(d_t, -omega, d_r, N); // r = r - omega * t OpenclKernels<double>::axpy(d_t, -omega, d_r, N); // r = r - omega * t
norm = OpenclKernels::norm(d_r, d_tmp, N); norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
if (verbosity >= 3) { if (verbosity >= 3) {
queue->finish(); queue->finish();
t_rest.stop(); t_rest.stop();

View File

@ -36,9 +36,12 @@ void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue*
} }
void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y){ void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y)
OpenclKernels::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, {
d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells); OpenclKernels<double>::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl,
*d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, dim, dim_wells,
*d_val_pointers_ocl, num_std_wells);
} }
void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){ void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){