OpenclKernels: template Scalar type

This commit is contained in:
Arne Morten Kvarving 2024-04-15 16:45:13 +02:00
parent be59203179
commit ba1c6db855
8 changed files with 263 additions and 179 deletions

View File

@ -31,7 +31,7 @@ endif()
foreach(CL ${CL_LIST})
get_filename_component(FNAME ${CL} NAME_WE)
file(APPEND ${CL_SRC_FILE} "const std::string OpenclKernels::${FNAME}_str = R\"\( \n")
file(APPEND ${CL_SRC_FILE} "template<> const std::string OpenclKernels<double>::${FNAME}_str = R\"\( \n")
file(READ "${CL}" CL_CONTENT)
file(APPEND ${CL_SRC_FILE} "${CL_CONTENT}")
file(APPEND ${CL_SRC_FILE} "\)\"; \n\n")

View File

@ -244,9 +244,9 @@ create_preconditioner(BlockedMatrix<double>* mat,
if (verbosity >= 5) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
}
OpenclKernels::ILU_decomp(firstRow, lastRow, s.rowIndices,
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
s.invDiagVals, rowsPerColor[color], block_size);
OpenclKernels<double>::ILU_decomp(firstRow, lastRow, s.rowIndices,
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
s.invDiagVals, rowsPerColor[color], block_size);
}
if (verbosity >= 3) {
@ -272,30 +272,30 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
for (int color = 0; color < numColors; ++color) {
#if CHOW_PATEL
OpenclKernels::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<double>::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#else
OpenclKernels::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<double>::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, y, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#endif
}
for (int color = numColors - 1; color >= 0; --color) {
#if CHOW_PATEL
OpenclKernels::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<double>::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#else
OpenclKernels::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
OpenclKernels<double>::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
color, rowsPerColor[color], block_size);
#endif
}
// apply relaxation
OpenclKernels::scale(x, relaxation, N);
OpenclKernels<double>::scale(x, relaxation, N);
if (verbosity >= 4) {
std::ostringstream out;

View File

@ -263,8 +263,14 @@ create_preconditioner(BlockedMatrix<double>* mat,
cl::WaitForEvents(events);
events.clear();
OpenclKernels::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap, d_lower.subsystemPointers, d_lower.nzIndices, d_lower.unknownRhsIndices, d_lower.knownRhsIndices, d_LUvals, d_invLvals, Nb);
OpenclKernels::isaiU(d_diagIndex, d_colPointers, d_rowIndices, d_csrToCscOffsetMap, d_upper.subsystemPointers, d_upper.nzIndices, d_upper.unknownRhsIndices, d_upper.knownRhsIndices, d_LUvals,
OpenclKernels<double>::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap,
d_lower.subsystemPointers, d_lower.nzIndices,
d_lower.unknownRhsIndices, d_lower.knownRhsIndices,
d_LUvals, d_invLvals, Nb);
OpenclKernels<double>::isaiU(d_diagIndex, d_colPointers, d_rowIndices,
d_csrToCscOffsetMap, d_upper.subsystemPointers,
d_upper.nzIndices, d_upper.unknownRhsIndices,
d_upper.knownRhsIndices, d_LUvals,
d_invDiagVals, d_invUvals, Nb);
if(verbosity >= 4){
@ -286,10 +292,12 @@ template <unsigned int block_size>
void BISAI<block_size>::apply(const cl::Buffer& x, cl::Buffer& y){
const unsigned int bs = block_size;
OpenclKernels::spmv(d_invLvals, d_rowIndices, d_colPointers, x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition
// (to compensate for the unitary diagonal that is not
// included in isaiL, for simplicity)
OpenclKernels::spmv(d_invUvals, d_rowIndices, d_colPointers, d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv
OpenclKernels<double>::spmv(d_invLvals, d_rowIndices, d_colPointers,
x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition
// (to compensate for the unitary diagonal that is not
// included in isaiL, for simplicity)
OpenclKernels<double>::spmv(d_invUvals, d_rowIndices, d_colPointers,
d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \

View File

@ -494,20 +494,21 @@ void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer&
// presmooth
double jacobi_damping = 0.65; // default value in amgcl: 0.72
for (unsigned i = 0; i < num_pre_smooth_steps; ++i){
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
OpenclKernels<double>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels<double>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
}
// move to coarser level
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
OpenclKernels<double>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels<double>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
amg_cycle_gpu(level + 1, f, u);
OpenclKernels::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
OpenclKernels<double>::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
// postsmooth
for (unsigned i = 0; i < num_post_smooth_steps; ++i){
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
OpenclKernels<double>::residual(A->nnzValues, A->colIndices, A->rowPointers,
x, y, t, Ncur, 1);
OpenclKernels<double>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
}
}
@ -528,12 +529,13 @@ void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
}
OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
OpenclKernels::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
OpenclKernels<double>::residual(d_mat->nnzValues, d_mat->colIndices,
d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
OpenclKernels<double>::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
OpenclKernels::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
OpenclKernels<double>::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
}
template <unsigned int block_size>

View File

@ -18,52 +18,71 @@
*/
#include <config.h>
#include <cmath>
#include <sstream>
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp> // defines CHOW_PATEL
namespace Opm
{
namespace Accelerator
{
#include <cmath>
#include <sstream>
namespace Opm::Accelerator {
using Opm::OpmLog;
using Dune::Timer;
// define static variables and kernels
int OpenclKernels::verbosity;
cl::CommandQueue *OpenclKernels::queue;
std::vector<double> OpenclKernels::tmp;
bool OpenclKernels::initialized = false;
std::size_t OpenclKernels::preferred_workgroup_size_multiple = 0;
template<class Scalar> int OpenclKernels<Scalar>::verbosity;
template<class Scalar> cl::CommandQueue* OpenclKernels<Scalar>::queue;
template<class Scalar> std::vector<Scalar> OpenclKernels<Scalar>::tmp;
template<class Scalar> bool OpenclKernels<Scalar>::initialized = false;
template<class Scalar> std::size_t OpenclKernels<Scalar>::preferred_workgroup_size_multiple = 0;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k;
std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::vmul_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::full_to_pressure_restriction_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels::add_coarse_pressure_correction_k;
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels::prolongate_vector_k;
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_k;
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_add_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_noreset_k;
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels::residual_blocked_k;
std::unique_ptr<residual_kernel_type> OpenclKernels::residual_k;
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k;
std::unique_ptr<isaiL_kernel_type> OpenclKernels::isaiL_k;
std::unique_ptr<isaiU_kernel_type> OpenclKernels::isaiU_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels<Scalar>::dot_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels<Scalar>::norm_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::axpy_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int> > OpenclKernels<Scalar>::scale_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::vmul_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int> > OpenclKernels<Scalar>::custom_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::full_to_pressure_restriction_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels<Scalar>::add_coarse_pressure_correction_k;
template<class Scalar>
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::prolongate_vector_k;
template<class Scalar>
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels<Scalar>::spmv_blocked_k;
template<class Scalar>
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels<Scalar>::spmv_blocked_add_k;
template<class Scalar>
std::unique_ptr<spmv_kernel_type> OpenclKernels<Scalar>::spmv_k;
template<class Scalar>
std::unique_ptr<spmv_kernel_type> OpenclKernels<Scalar>::spmv_noreset_k;
template<class Scalar>
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels<Scalar>::residual_blocked_k;
template<class Scalar>
std::unique_ptr<residual_kernel_type> OpenclKernels<Scalar>::residual_k;
template<class Scalar>
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels<Scalar>::ILU_apply1_k;
template<class Scalar>
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels<Scalar>::ILU_apply2_k;
template<class Scalar>
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels<Scalar>::stdwell_apply_k;
template<class Scalar>
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels<Scalar>::ilu_decomp_k;
template<class Scalar>
std::unique_ptr<isaiL_kernel_type> OpenclKernels<Scalar>::isaiL_k;
template<class Scalar>
std::unique_ptr<isaiU_kernel_type> OpenclKernels<Scalar>::isaiU_k;
// divide A by B, and round up: return (int)ceil(A/B)
unsigned int ceilDivision(const unsigned int A, const unsigned int B)
@ -71,7 +90,10 @@ unsigned int ceilDivision(const unsigned int A, const unsigned int B)
return A / B + (A % B > 0);
}
void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_)
template<class Scalar>
void OpenclKernels<Scalar>::init(cl::Context *context,
cl::CommandQueue *queue_,
std::vector<cl::Device>& devices, int verbosity_)
{
if (initialized) {
OpmLog::debug("Warning OpenclKernels is already initialized");
@ -118,10 +140,10 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
// actually creating the kernels
dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
vmul_k.reset(new cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int>(cl::Kernel(program, "scale")));
vmul_k.reset(new cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int>(cl::Kernel(program, "custom")));
full_to_pressure_restriction_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "full_to_pressure_restriction")));
add_coarse_pressure_correction_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "add_coarse_pressure_correction")));
prolongate_vector_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int>(cl::Kernel(program, "prolongate_vector")));
@ -146,20 +168,21 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
initialized = true;
} // end get_opencl_kernels()
double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
template<class Scalar>
Scalar OpenclKernels<Scalar>::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_dot;
tmp.resize(num_work_groups);
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data());
double gpu_sum = 0.0;
Scalar gpu_sum = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_sum += tmp[i];
}
@ -174,20 +197,21 @@ double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int
return gpu_sum;
}
double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
template<class Scalar>
Scalar OpenclKernels<Scalar>::norm(cl::Buffer& in, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_norm;
tmp.resize(num_work_groups);
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data());
double gpu_norm = 0.0;
Scalar gpu_norm = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_norm += tmp[i];
}
@ -203,7 +227,8 @@ double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
return gpu_norm;
}
void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
template<class Scalar>
void OpenclKernels<Scalar>::axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -220,7 +245,8 @@ void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
}
}
void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
template<class Scalar>
void OpenclKernels<Scalar>::scale(cl::Buffer& in, const Scalar a, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -237,7 +263,8 @@ void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
}
}
void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
template<class Scalar>
void OpenclKernels<Scalar>::vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -254,8 +281,9 @@ void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, c
}
}
void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
const double omega, const double beta, int N)
template<class Scalar>
void OpenclKernels<Scalar>::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
const Scalar omega, const Scalar beta, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -272,7 +300,8 @@ void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
}
}
void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
template<class Scalar>
void OpenclKernels<Scalar>::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -289,7 +318,8 @@ void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::B
}
}
void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
template<class Scalar>
void OpenclKernels<Scalar>::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -306,7 +336,8 @@ void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buf
}
}
void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
template<class Scalar>
void OpenclKernels<Scalar>::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
@ -323,32 +354,33 @@ void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, con
}
}
void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
const cl::Buffer& x, cl::Buffer& b, int Nb,
unsigned int block_size, bool reset, bool add)
template<class Scalar>
void OpenclKernels<Scalar>::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
const cl::Buffer& x, cl::Buffer& b, int Nb,
unsigned int block_size, bool reset, bool add)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_spmv;
cl::Event event;
if (block_size > 1) {
if (add) {
event = (*spmv_blocked_add_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
} else {
event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
}
} else {
if (reset) {
event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
} else {
event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
}
}
@ -360,23 +392,24 @@ void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
}
}
void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& x, const cl::Buffer& rhs,
cl::Buffer& out, int Nb, unsigned int block_size)
template<class Scalar>
void OpenclKernels<Scalar>::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& x, const cl::Buffer& rhs,
cl::Buffer& out, int Nb, unsigned int block_size)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_residual;
cl::Event event;
if (block_size > 1) {
event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
} else {
event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
}
if (verbosity >= 4) {
@ -387,22 +420,23 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row
}
}
void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
template<class Scalar>
void OpenclKernels<Scalar>::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
const cl::Buffer& y, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
{
const unsigned int work_group_size = preferred_workgroup_size_multiple;
const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_ilu_apply1;
cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
rowIndices, vals, cols, rows, diagIndex,
y, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
rowIndices, vals, cols, rows, diagIndex,
y, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
event.wait();
@ -412,22 +446,23 @@ void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf
}
}
void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& invDiagVals, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
template<class Scalar>
void OpenclKernels<Scalar>::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
cl::Buffer& rows, cl::Buffer& diagIndex,
cl::Buffer& invDiagVals, cl::Buffer& x,
cl::Buffer& rowsPerColor, int color,
int rowsThisColor, unsigned int block_size)
{
const unsigned int work_group_size = preferred_workgroup_size_multiple;
const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
Timer t_ilu_apply2;
cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
rowIndices, vals, cols, rows, diagIndex,
invDiagVals, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
rowIndices, vals, cols, rows, diagIndex,
invDiagVals, x, rowsPerColor, color, block_size,
cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
event.wait();
@ -437,23 +472,24 @@ void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf
}
}
void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
int rowsThisColor, unsigned int block_size)
template<class Scalar>
void OpenclKernels<Scalar>::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
int rowsThisColor, unsigned int block_size)
{
const unsigned int work_group_size = 128;
const unsigned int num_work_groups = rowsThisColor;
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int num_hwarps_per_group = work_group_size / 16;
const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot
const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(Scalar); // each block needs a pivot
Timer t_ilu_decomp;
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
firstRow, lastRow, rowIndices,
vals, cols, rows,
invDiagVals, diagIndex, rowsThisColor,
cl::Local(lmem_per_work_group));
firstRow, lastRow, rowIndices,
vals, cols, rows,
invDiagVals, diagIndex, rowsThisColor,
cl::Local(lmem_per_work_group));
if (verbosity >= 4) {
event.wait();
@ -463,19 +499,20 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices
}
}
void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
template<class Scalar>
void OpenclKernels<Scalar>::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
{
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
const unsigned int lmem1 = sizeof(Scalar) * work_group_size;
const unsigned int lmem2 = sizeof(Scalar) * dim_wells;
Timer t_apply_stdwells;
cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
if (verbosity >= 4) {
event.wait();
@ -485,8 +522,9 @@ void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_
}
}
void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb)
template<class Scalar>
void OpenclKernels<Scalar>::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -494,7 +532,7 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
Timer t_isaiL;
cl::Event event = (*isaiL_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb);
diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb);
if (verbosity >= 4) {
event.wait();
@ -504,9 +542,10 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
}
}
void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb)
template<class Scalar>
void OpenclKernels<Scalar>::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
@ -514,7 +553,7 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
Timer t_isaiU;
cl::Event event = (*isaiU_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb);
diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb);
if (verbosity >= 4) {
event.wait();
@ -524,5 +563,6 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
}
}
} // namespace Accelerator
} // namespace Opm
template class OpenclKernels<double>;
} // namespace Opm::Accelerator

View File

@ -26,10 +26,7 @@
#include <opm/simulators/linalg/bda/opencl/opencl.hpp>
namespace Opm
{
namespace Accelerator
{
namespace Opm::Accelerator {
using spmv_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
@ -54,21 +51,22 @@ using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer
using isaiU_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;
template<class Scalar>
class OpenclKernels
{
private:
static int verbosity;
static cl::CommandQueue *queue;
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
static std::vector<Scalar> tmp; // used as tmp CPU buffer for dot() and norm()
static bool initialized;
static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
static std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int> > axpy_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int> > scale_k;
static std::unique_ptr<cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int> > custom_k;
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > full_to_pressure_restriction_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > add_coarse_pressure_correction_k;
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > prolongate_vector_k;
@ -117,12 +115,12 @@ public:
static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);
static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static double norm(cl::Buffer& in, cl::Buffer& out, int N);
static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N);
static void scale(cl::Buffer& in, const double a, int N);
static void vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
static Scalar dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static Scalar norm(cl::Buffer& in, cl::Buffer& out, int N);
static void axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N);
static void scale(cl::Buffer& in, const Scalar a, int N);
static void vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const Scalar omega, const Scalar beta, int N);
static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb);
static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb);
static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N);
@ -150,7 +148,40 @@ public:
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb);
};
} // namespace Accelerator
} // namespace Opm
#if CHOW_PATEL
#define DECLARE_ILU(T) \
template<> const std::string OpenclKernels<T>::ILU_apply1_str; \
template<> const std::string OpenclKernels<T>::ILU_apply2_str;
#else
#define DECLARE_ILU(T) \
template<> const std::string OpenclKernels<T>::ILU_apply1_fm_str; \
template<> const std::string OpenclKernels<T>::ILU_apply2_fm_str;
#endif
#define DECLARE_INSTANCE(T) \
DECLARE_ILU(T) \
template<> const std::string OpenclKernels<T>::axpy_str; \
template<> const std::string OpenclKernels<T>::scale_str; \
template<> const std::string OpenclKernels<T>::vmul_str; \
template<> const std::string OpenclKernels<T>::dot_1_str; \
template<> const std::string OpenclKernels<T>::norm_str; \
template<> const std::string OpenclKernels<T>::custom_str; \
template<> const std::string OpenclKernels<T>::full_to_pressure_restriction_str; \
template<> const std::string OpenclKernels<T>::add_coarse_pressure_correction_str; \
template<> const std::string OpenclKernels<T>::prolongate_vector_str; \
template<> const std::string OpenclKernels<T>::spmv_blocked_str; \
template<> const std::string OpenclKernels<T>::spmv_blocked_add_str; \
template<> const std::string OpenclKernels<T>::spmv_str; \
template<> const std::string OpenclKernels<T>::spmv_noreset_str; \
template<> const std::string OpenclKernels<T>::residual_blocked_str; \
template<> const std::string OpenclKernels<T>::residual_str; \
template<> const std::string OpenclKernels<T>::stdwell_apply_str; \
template<> const std::string OpenclKernels<T>::ILU_decomp_str; \
template<> const std::string OpenclKernels<T>::isaiL_str; \
template<> const std::string OpenclKernels<T>::isaiU_str;
DECLARE_INSTANCE(double)
} // namespace Opm::Accelerator
#endif

View File

@ -203,7 +203,7 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
context = std::make_shared<cl::Context>(devices[0]);
queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));
OpenclKernels::init(context.get(), queue.get(), devices, verbosity);
OpenclKernels<double>::init(context.get(), queue.get(), devices, verbosity);
} catch (const cl::Error& error) {
std::ostringstream oss;
@ -263,7 +263,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
}
norm = OpenclKernels::norm(d_r, d_tmp, N);
norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
norm_0 = norm;
if (verbosity > 1) {
@ -277,11 +277,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
for (it = 0.5; it < maxit; it += 0.5) {
rhop = rho;
rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N);
rho = OpenclKernels<double>::dot(d_rw, d_r, d_tmp, N);
if (it > 1) {
beta = (rho / rhop) * (alpha / omega);
OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N);
OpenclKernels<double>::custom(d_p, d_v, d_r, omega, beta, N);
}
if (verbosity >= 3) {
queue->finish();
@ -298,7 +298,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
// v = A * pw
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
OpenclKernels<double>::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
if (verbosity >= 3) {
queue->finish();
t_spmv.stop();
@ -315,11 +315,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_rest.start();
}
tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N);
tmp1 = OpenclKernels<double>::dot(d_rw, d_v, d_tmp, N);
alpha = rho / tmp1;
OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
OpenclKernels::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
norm = OpenclKernels::norm(d_r, d_tmp, N);
OpenclKernels<double>::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
OpenclKernels<double>::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
if (verbosity >= 3) {
queue->finish();
t_rest.stop();
@ -343,7 +343,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
}
// t = A * s
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
OpenclKernels<double>::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
if(verbosity >= 3){
queue->finish();
t_spmv.stop();
@ -360,12 +360,12 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_rest.start();
}
tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N);
tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N);
tmp1 = OpenclKernels<double>::dot(d_t, d_r, d_tmp, N);
tmp2 = OpenclKernels<double>::dot(d_t, d_t, d_tmp, N);
omega = tmp1 / tmp2;
OpenclKernels::axpy(d_s, omega, d_x, N); // x = x + omega * s
OpenclKernels::axpy(d_t, -omega, d_r, N); // r = r - omega * t
norm = OpenclKernels::norm(d_r, d_tmp, N);
OpenclKernels<double>::axpy(d_s, omega, d_x, N); // x = x + omega * s
OpenclKernels<double>::axpy(d_t, -omega, d_r, N); // r = r - omega * t
norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
if (verbosity >= 3) {
queue->finish();
t_rest.stop();

View File

@ -36,9 +36,12 @@ void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue*
}
void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y){
OpenclKernels::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y)
{
OpenclKernels<double>::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl,
*d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, dim, dim_wells,
*d_val_pointers_ocl, num_std_wells);
}
void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){