mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
OpenclKernels: template Scalar type
This commit is contained in:
parent
be59203179
commit
ba1c6db855
@ -31,7 +31,7 @@ endif()
|
||||
foreach(CL ${CL_LIST})
|
||||
get_filename_component(FNAME ${CL} NAME_WE)
|
||||
|
||||
file(APPEND ${CL_SRC_FILE} "const std::string OpenclKernels::${FNAME}_str = R\"\( \n")
|
||||
file(APPEND ${CL_SRC_FILE} "template<> const std::string OpenclKernels<double>::${FNAME}_str = R\"\( \n")
|
||||
file(READ "${CL}" CL_CONTENT)
|
||||
file(APPEND ${CL_SRC_FILE} "${CL_CONTENT}")
|
||||
file(APPEND ${CL_SRC_FILE} "\)\"; \n\n")
|
||||
|
@ -244,9 +244,9 @@ create_preconditioner(BlockedMatrix<double>* mat,
|
||||
if (verbosity >= 5) {
|
||||
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
|
||||
}
|
||||
OpenclKernels::ILU_decomp(firstRow, lastRow, s.rowIndices,
|
||||
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
|
||||
s.invDiagVals, rowsPerColor[color], block_size);
|
||||
OpenclKernels<double>::ILU_decomp(firstRow, lastRow, s.rowIndices,
|
||||
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
|
||||
s.invDiagVals, rowsPerColor[color], block_size);
|
||||
}
|
||||
|
||||
if (verbosity >= 3) {
|
||||
@ -272,30 +272,30 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
|
||||
|
||||
for (int color = 0; color < numColors; ++color) {
|
||||
#if CHOW_PATEL
|
||||
OpenclKernels::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
|
||||
s.diagIndex, y, x, s.rowsPerColor,
|
||||
color, rowsPerColor[color], block_size);
|
||||
OpenclKernels<double>::ILU_apply1(s.rowIndices, s.Lvals, s.Lcols, s.Lrows,
|
||||
s.diagIndex, y, x, s.rowsPerColor,
|
||||
color, rowsPerColor[color], block_size);
|
||||
#else
|
||||
OpenclKernels::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
|
||||
s.diagIndex, y, x, s.rowsPerColor,
|
||||
color, rowsPerColor[color], block_size);
|
||||
OpenclKernels<double>::ILU_apply1(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
|
||||
s.diagIndex, y, x, s.rowsPerColor,
|
||||
color, rowsPerColor[color], block_size);
|
||||
#endif
|
||||
}
|
||||
|
||||
for (int color = numColors - 1; color >= 0; --color) {
|
||||
#if CHOW_PATEL
|
||||
OpenclKernels::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
|
||||
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
|
||||
color, rowsPerColor[color], block_size);
|
||||
OpenclKernels<double>::ILU_apply2(s.rowIndices, s.Uvals, s.Ucols, s.Urows,
|
||||
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
|
||||
color, rowsPerColor[color], block_size);
|
||||
#else
|
||||
OpenclKernels::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
|
||||
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
|
||||
color, rowsPerColor[color], block_size);
|
||||
OpenclKernels<double>::ILU_apply2(s.rowIndices, s.LUvals, s.LUcols, s.LUrows,
|
||||
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
|
||||
color, rowsPerColor[color], block_size);
|
||||
#endif
|
||||
}
|
||||
|
||||
// apply relaxation
|
||||
OpenclKernels::scale(x, relaxation, N);
|
||||
OpenclKernels<double>::scale(x, relaxation, N);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
std::ostringstream out;
|
||||
|
@ -263,8 +263,14 @@ create_preconditioner(BlockedMatrix<double>* mat,
|
||||
cl::WaitForEvents(events);
|
||||
events.clear();
|
||||
|
||||
OpenclKernels::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap, d_lower.subsystemPointers, d_lower.nzIndices, d_lower.unknownRhsIndices, d_lower.knownRhsIndices, d_LUvals, d_invLvals, Nb);
|
||||
OpenclKernels::isaiU(d_diagIndex, d_colPointers, d_rowIndices, d_csrToCscOffsetMap, d_upper.subsystemPointers, d_upper.nzIndices, d_upper.unknownRhsIndices, d_upper.knownRhsIndices, d_LUvals,
|
||||
OpenclKernels<double>::isaiL(d_diagIndex, d_colPointers, d_csrToCscOffsetMap,
|
||||
d_lower.subsystemPointers, d_lower.nzIndices,
|
||||
d_lower.unknownRhsIndices, d_lower.knownRhsIndices,
|
||||
d_LUvals, d_invLvals, Nb);
|
||||
OpenclKernels<double>::isaiU(d_diagIndex, d_colPointers, d_rowIndices,
|
||||
d_csrToCscOffsetMap, d_upper.subsystemPointers,
|
||||
d_upper.nzIndices, d_upper.unknownRhsIndices,
|
||||
d_upper.knownRhsIndices, d_LUvals,
|
||||
d_invDiagVals, d_invUvals, Nb);
|
||||
|
||||
if(verbosity >= 4){
|
||||
@ -286,10 +292,12 @@ template <unsigned int block_size>
|
||||
void BISAI<block_size>::apply(const cl::Buffer& x, cl::Buffer& y){
|
||||
const unsigned int bs = block_size;
|
||||
|
||||
OpenclKernels::spmv(d_invLvals, d_rowIndices, d_colPointers, x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition
|
||||
// (to compensate for the unit diagonal that is not
|
||||
// included in isaiL, for simplicity)
|
||||
OpenclKernels::spmv(d_invUvals, d_rowIndices, d_colPointers, d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv
|
||||
OpenclKernels<double>::spmv(d_invLvals, d_rowIndices, d_colPointers,
|
||||
x, d_invL_x, Nb, bs, true, true); // application of isaiL is a simple spmv with addition
|
||||
// (to compensate for the unit diagonal that is not
|
||||
// included in isaiL, for simplicity)
|
||||
OpenclKernels<double>::spmv(d_invUvals, d_rowIndices, d_colPointers,
|
||||
d_invL_x, y, Nb, bs); // application of isaiU is a simple spmv
|
||||
}
|
||||
|
||||
#define INSTANTIATE_BDA_FUNCTIONS(n) \
|
||||
|
@ -494,20 +494,21 @@ void CPR<block_size>::amg_cycle_gpu(const int level, cl::Buffer& y, cl::Buffer&
|
||||
// presmooth
|
||||
double jacobi_damping = 0.65; // default value in amgcl: 0.72
|
||||
for (unsigned i = 0; i < num_pre_smooth_steps; ++i){
|
||||
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
OpenclKernels<double>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels<double>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
}
|
||||
|
||||
// move to coarser level
|
||||
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
|
||||
OpenclKernels<double>::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels<double>::spmv(R->nnzValues, R->colIndices, R->rowPointers, t, f, Nnext, 1, true);
|
||||
amg_cycle_gpu(level + 1, f, u);
|
||||
OpenclKernels::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
|
||||
OpenclKernels<double>::prolongate_vector(u, x, d_PcolIndices[level], Ncur);
|
||||
|
||||
// postsmooth
|
||||
for (unsigned i = 0; i < num_post_smooth_steps; ++i){
|
||||
OpenclKernels::residual(A->nnzValues, A->colIndices, A->rowPointers, x, y, t, Ncur, 1);
|
||||
OpenclKernels::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
OpenclKernels<double>::residual(A->nnzValues, A->colIndices, A->rowPointers,
|
||||
x, y, t, Ncur, 1);
|
||||
OpenclKernels<double>::vmul(jacobi_damping, d_invDiags[level], t, x, Ncur);
|
||||
}
|
||||
}
|
||||
|
||||
@ -528,12 +529,13 @@ void CPR<block_size>::apply_amg(const cl::Buffer& y, cl::Buffer& x) {
|
||||
OPM_THROW(std::logic_error, "CPR OpenCL enqueueWriteBuffer error");
|
||||
}
|
||||
|
||||
OpenclKernels::residual(d_mat->nnzValues, d_mat->colIndices, d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
|
||||
OpenclKernels::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
|
||||
OpenclKernels<double>::residual(d_mat->nnzValues, d_mat->colIndices,
|
||||
d_mat->rowPointers, x, y, *d_rs, Nb, block_size);
|
||||
OpenclKernels<double>::full_to_pressure_restriction(*d_rs, *d_weights, *d_coarse_y, Nb);
|
||||
|
||||
amg_cycle_gpu(0, *d_coarse_y, *d_coarse_x);
|
||||
|
||||
OpenclKernels::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
|
||||
OpenclKernels<double>::add_coarse_pressure_correction(*d_coarse_x, x, pressure_idx, Nb);
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
|
@ -18,52 +18,71 @@
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <cmath>
|
||||
#include <sstream>
|
||||
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
#include <dune/common/timer.hh>
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/openclKernels.hpp>
|
||||
#include <opm/simulators/linalg/bda/opencl/ChowPatelIlu.hpp> // defines CHOW_PATEL
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
#include <cmath>
|
||||
#include <sstream>
|
||||
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using Opm::OpmLog;
|
||||
using Dune::Timer;
|
||||
|
||||
// define static variables and kernels
|
||||
int OpenclKernels::verbosity;
|
||||
cl::CommandQueue *OpenclKernels::queue;
|
||||
std::vector<double> OpenclKernels::tmp;
|
||||
bool OpenclKernels::initialized = false;
|
||||
std::size_t OpenclKernels::preferred_workgroup_size_multiple = 0;
|
||||
template<class Scalar> int OpenclKernels<Scalar>::verbosity;
|
||||
template<class Scalar> cl::CommandQueue* OpenclKernels<Scalar>::queue;
|
||||
template<class Scalar> std::vector<Scalar> OpenclKernels<Scalar>::tmp;
|
||||
template<class Scalar> bool OpenclKernels<Scalar>::initialized = false;
|
||||
template<class Scalar> std::size_t OpenclKernels<Scalar>::preferred_workgroup_size_multiple = 0;
|
||||
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k;
|
||||
std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::vmul_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
|
||||
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels::full_to_pressure_restriction_k;
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels::add_coarse_pressure_correction_k;
|
||||
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels::prolongate_vector_k;
|
||||
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_k;
|
||||
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels::spmv_blocked_add_k;
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_k;
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_noreset_k;
|
||||
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels::residual_blocked_k;
|
||||
std::unique_ptr<residual_kernel_type> OpenclKernels::residual_k;
|
||||
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
|
||||
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
|
||||
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
|
||||
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k;
|
||||
std::unique_ptr<isaiL_kernel_type> OpenclKernels::isaiL_k;
|
||||
std::unique_ptr<isaiU_kernel_type> OpenclKernels::isaiU_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels<Scalar>::dot_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels<Scalar>::norm_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::axpy_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int> > OpenclKernels<Scalar>::scale_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::vmul_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int> > OpenclKernels<Scalar>::custom_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::full_to_pressure_restriction_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > OpenclKernels<Scalar>::add_coarse_pressure_correction_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > OpenclKernels<Scalar>::prolongate_vector_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels<Scalar>::spmv_blocked_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<spmv_blocked_kernel_type> OpenclKernels<Scalar>::spmv_blocked_add_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels<Scalar>::spmv_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<spmv_kernel_type> OpenclKernels<Scalar>::spmv_noreset_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<residual_blocked_kernel_type> OpenclKernels<Scalar>::residual_blocked_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<residual_kernel_type> OpenclKernels<Scalar>::residual_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels<Scalar>::ILU_apply1_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels<Scalar>::ILU_apply2_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels<Scalar>::stdwell_apply_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels<Scalar>::ilu_decomp_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<isaiL_kernel_type> OpenclKernels<Scalar>::isaiL_k;
|
||||
template<class Scalar>
|
||||
std::unique_ptr<isaiU_kernel_type> OpenclKernels<Scalar>::isaiU_k;
|
||||
|
||||
// divide A by B, and round up: return (int)ceil(A/B)
|
||||
unsigned int ceilDivision(const unsigned int A, const unsigned int B)
|
||||
@ -71,7 +90,10 @@ unsigned int ceilDivision(const unsigned int A, const unsigned int B)
|
||||
return A / B + (A % B > 0);
|
||||
}
|
||||
|
||||
void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::init(cl::Context *context,
|
||||
cl::CommandQueue *queue_,
|
||||
std::vector<cl::Device>& devices, int verbosity_)
|
||||
{
|
||||
if (initialized) {
|
||||
OpmLog::debug("Warning OpenclKernels is already initialized");
|
||||
@ -118,10 +140,10 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
||||
// actually creating the kernels
|
||||
dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
|
||||
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
|
||||
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
|
||||
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
|
||||
vmul_k.reset(new cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
|
||||
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
|
||||
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
|
||||
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int>(cl::Kernel(program, "scale")));
|
||||
vmul_k.reset(new cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "vmul")));
|
||||
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int>(cl::Kernel(program, "custom")));
|
||||
full_to_pressure_restriction_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>(cl::Kernel(program, "full_to_pressure_restriction")));
|
||||
add_coarse_pressure_correction_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int>(cl::Kernel(program, "add_coarse_pressure_correction")));
|
||||
prolongate_vector_k.reset(new cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int>(cl::Kernel(program, "prolongate_vector")));
|
||||
@ -146,20 +168,21 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
||||
initialized = true;
|
||||
} // end init()
|
||||
|
||||
double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
|
||||
template<class Scalar>
|
||||
Scalar OpenclKernels<Scalar>::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 256;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
|
||||
Timer t_dot;
|
||||
tmp.resize(num_work_groups);
|
||||
|
||||
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
|
||||
|
||||
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
|
||||
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data());
|
||||
|
||||
double gpu_sum = 0.0;
|
||||
Scalar gpu_sum = 0.0;
|
||||
for (unsigned int i = 0; i < num_work_groups; ++i) {
|
||||
gpu_sum += tmp[i];
|
||||
}
|
||||
@ -174,20 +197,21 @@ double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int
|
||||
return gpu_sum;
|
||||
}
|
||||
|
||||
double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
|
||||
template<class Scalar>
|
||||
Scalar OpenclKernels<Scalar>::norm(cl::Buffer& in, cl::Buffer& out, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 256;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
|
||||
Timer t_norm;
|
||||
tmp.resize(num_work_groups);
|
||||
|
||||
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
|
||||
|
||||
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
|
||||
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(Scalar) * num_work_groups, tmp.data());
|
||||
|
||||
double gpu_norm = 0.0;
|
||||
Scalar gpu_norm = 0.0;
|
||||
for (unsigned int i = 0; i < num_work_groups; ++i) {
|
||||
gpu_norm += tmp[i];
|
||||
}
|
||||
@ -203,7 +227,8 @@ double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
|
||||
return gpu_norm;
|
||||
}
|
||||
|
||||
void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
@ -220,7 +245,8 @@ void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::scale(cl::Buffer& in, const Scalar a, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
@ -237,7 +263,8 @@ void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
@ -254,8 +281,9 @@ void OpenclKernels::vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, c
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
|
||||
const double omega, const double beta, int N)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
|
||||
const Scalar omega, const Scalar beta, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
@ -272,7 +300,8 @@ void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r,
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
@ -289,7 +318,8 @@ void OpenclKernels::full_to_pressure_restriction(const cl::Buffer& fine_y, cl::B
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
@ -306,7 +336,8 @@ void OpenclKernels::add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buf
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
|
||||
@ -323,32 +354,33 @@ void OpenclKernels::prolongate_vector(const cl::Buffer& in, cl::Buffer& out, con
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
const cl::Buffer& x, cl::Buffer& b, int Nb,
|
||||
unsigned int block_size, bool reset, bool add)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
const cl::Buffer& x, cl::Buffer& b, int Nb,
|
||||
unsigned int block_size, bool reset, bool add)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
|
||||
Timer t_spmv;
|
||||
cl::Event event;
|
||||
|
||||
if (block_size > 1) {
|
||||
if (add) {
|
||||
event = (*spmv_blocked_add_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
} else {
|
||||
event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
|
||||
}
|
||||
} else {
|
||||
if (reset) {
|
||||
event = (*spmv_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
|
||||
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
|
||||
} else {
|
||||
event = (*spmv_noreset_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
|
||||
vals, cols, rows, Nb, x, b, cl::Local(lmem_per_work_group));
|
||||
}
|
||||
}
|
||||
|
||||
@ -360,23 +392,24 @@ void OpenclKernels::spmv(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
cl::Buffer& x, const cl::Buffer& rhs,
|
||||
cl::Buffer& out, int Nb, unsigned int block_size)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
cl::Buffer& x, const cl::Buffer& rhs,
|
||||
cl::Buffer& out, int Nb, unsigned int block_size)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
|
||||
Timer t_residual;
|
||||
cl::Event event;
|
||||
|
||||
if (block_size > 1) {
|
||||
event = (*residual_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
|
||||
vals, cols, rows, Nb, x, rhs, out, block_size, cl::Local(lmem_per_work_group));
|
||||
} else {
|
||||
event = (*residual_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
|
||||
vals, cols, rows, Nb, x, rhs, out, cl::Local(lmem_per_work_group));
|
||||
}
|
||||
|
||||
if (verbosity >= 4) {
|
||||
@ -387,22 +420,23 @@ void OpenclKernels::residual(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& row
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
|
||||
cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
const cl::Buffer& y, cl::Buffer& x,
|
||||
cl::Buffer& rowsPerColor, int color,
|
||||
int rowsThisColor, unsigned int block_size)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
|
||||
cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
const cl::Buffer& y, cl::Buffer& x,
|
||||
cl::Buffer& rowsPerColor, int color,
|
||||
int rowsThisColor, unsigned int block_size)
|
||||
{
|
||||
const unsigned int work_group_size = preferred_workgroup_size_multiple;
|
||||
const unsigned int num_work_groups = rowsThisColor;
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
|
||||
Timer t_ilu_apply1;
|
||||
|
||||
cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
rowIndices, vals, cols, rows, diagIndex,
|
||||
y, x, rowsPerColor, color, block_size,
|
||||
cl::Local(lmem_per_work_group));
|
||||
rowIndices, vals, cols, rows, diagIndex,
|
||||
y, x, rowsPerColor, color, block_size,
|
||||
cl::Local(lmem_per_work_group));
|
||||
|
||||
if (verbosity >= 5) {
|
||||
event.wait();
|
||||
@ -412,22 +446,23 @@ void OpenclKernels::ILU_apply1(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
|
||||
cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
cl::Buffer& invDiagVals, cl::Buffer& x,
|
||||
cl::Buffer& rowsPerColor, int color,
|
||||
int rowsThisColor, unsigned int block_size)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buffer& cols,
|
||||
cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||
cl::Buffer& invDiagVals, cl::Buffer& x,
|
||||
cl::Buffer& rowsPerColor, int color,
|
||||
int rowsThisColor, unsigned int block_size)
|
||||
{
|
||||
const unsigned int work_group_size = preferred_workgroup_size_multiple;
|
||||
const unsigned int num_work_groups = rowsThisColor;
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem_per_work_group = sizeof(Scalar) * work_group_size;
|
||||
Timer t_ilu_apply2;
|
||||
|
||||
cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
rowIndices, vals, cols, rows, diagIndex,
|
||||
invDiagVals, x, rowsPerColor, color, block_size,
|
||||
cl::Local(lmem_per_work_group));
|
||||
rowIndices, vals, cols, rows, diagIndex,
|
||||
invDiagVals, x, rowsPerColor, color, block_size,
|
||||
cl::Local(lmem_per_work_group));
|
||||
|
||||
if (verbosity >= 5) {
|
||||
event.wait();
|
||||
@ -437,23 +472,24 @@ void OpenclKernels::ILU_apply2(cl::Buffer& rowIndices, cl::Buffer& vals, cl::Buf
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
|
||||
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
|
||||
int rowsThisColor, unsigned int block_size)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices,
|
||||
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
|
||||
int rowsThisColor, unsigned int block_size)
|
||||
{
|
||||
const unsigned int work_group_size = 128;
|
||||
const unsigned int num_work_groups = rowsThisColor;
|
||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||
const unsigned int num_hwarps_per_group = work_group_size / 16;
|
||||
const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot
|
||||
const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(Scalar); // each block needs a pivot
|
||||
Timer t_ilu_decomp;
|
||||
|
||||
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
firstRow, lastRow, rowIndices,
|
||||
vals, cols, rows,
|
||||
invDiagVals, diagIndex, rowsThisColor,
|
||||
cl::Local(lmem_per_work_group));
|
||||
firstRow, lastRow, rowIndices,
|
||||
vals, cols, rows,
|
||||
invDiagVals, diagIndex, rowsThisColor,
|
||||
cl::Local(lmem_per_work_group));
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
@ -463,19 +499,20 @@ void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& rowIndices
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
|
||||
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
|
||||
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_ocl, cl::Buffer &d_Bnnzs_ocl,
|
||||
cl::Buffer &d_Ccols_ocl, cl::Buffer &d_Bcols_ocl, cl::Buffer &d_x, cl::Buffer &d_y,
|
||||
int dim, int dim_wells, cl::Buffer &d_val_pointers_ocl, int num_std_wells)
|
||||
{
|
||||
const unsigned int work_group_size = 32;
|
||||
const unsigned int total_work_items = num_std_wells * work_group_size;
|
||||
const unsigned int lmem1 = sizeof(double) * work_group_size;
|
||||
const unsigned int lmem2 = sizeof(double) * dim_wells;
|
||||
const unsigned int lmem1 = sizeof(Scalar) * work_group_size;
|
||||
const unsigned int lmem2 = sizeof(Scalar) * dim_wells;
|
||||
Timer t_apply_stdwells;
|
||||
|
||||
cl::Event event = (*stdwell_apply_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
|
||||
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
|
||||
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells, d_val_pointers_ocl,
|
||||
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
@ -485,8 +522,9 @@ void OpenclKernels::apply_stdwells(cl::Buffer& d_Cnnzs_ocl, cl::Buffer &d_Dnnzs_
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
|
||||
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& mapping, cl::Buffer& nvc,
|
||||
cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals, cl::Buffer& invLvals, unsigned int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 256;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
@ -494,7 +532,7 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
|
||||
|
||||
Timer t_isaiL;
|
||||
cl::Event event = (*isaiL_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb);
|
||||
diagIndex, colPointers, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invLvals, Nb);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
@ -504,9 +542,10 @@ void OpenclKernels::isaiL(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
|
||||
}
|
||||
}
|
||||
|
||||
void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
|
||||
cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
|
||||
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb)
|
||||
template<class Scalar>
|
||||
void OpenclKernels<Scalar>::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Buffer& rowIndices, cl::Buffer& mapping,
|
||||
cl::Buffer& nvc, cl::Buffer& luIdxs, cl::Buffer& xxIdxs, cl::Buffer& dxIdxs, cl::Buffer& LUvals,
|
||||
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb)
|
||||
{
|
||||
const unsigned int work_group_size = 256;
|
||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
||||
@ -514,7 +553,7 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
|
||||
|
||||
Timer t_isaiU;
|
||||
cl::Event event = (*isaiU_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb);
|
||||
diagIndex, colPointers, rowIndices, mapping, nvc, luIdxs, xxIdxs, dxIdxs, LUvals, invDiagVals, invUvals, Nb);
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
@ -524,5 +563,6 @@ void OpenclKernels::isaiU(cl::Buffer& diagIndex, cl::Buffer& colPointers, cl::Bu
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
template class OpenclKernels<double>;
|
||||
|
||||
} // namespace Opm::Accelerator
|
||||
|
@ -26,10 +26,7 @@
|
||||
|
||||
#include <opm/simulators/linalg/bda/opencl/opencl.hpp>
|
||||
|
||||
namespace Opm
|
||||
{
|
||||
namespace Accelerator
|
||||
{
|
||||
namespace Opm::Accelerator {
|
||||
|
||||
using spmv_blocked_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
|
||||
const cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
|
||||
@ -54,21 +51,22 @@ using isaiL_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer
|
||||
using isaiU_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int>;
|
||||
|
||||
template<class Scalar>
|
||||
class OpenclKernels
|
||||
{
|
||||
private:
|
||||
static int verbosity;
|
||||
static cl::CommandQueue *queue;
|
||||
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
|
||||
static std::vector<Scalar> tmp; // used as tmp CPU buffer for dot() and norm()
|
||||
static bool initialized;
|
||||
static std::size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
|
||||
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<const double, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, cl::Buffer&, const unsigned int> > axpy_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const Scalar, const unsigned int> > scale_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<const Scalar, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > vmul_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const Scalar, const Scalar, const unsigned int> > custom_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int> > full_to_pressure_restriction_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int> > add_coarse_pressure_correction_k;
|
||||
static std::unique_ptr<cl::KernelFunctor<const cl::Buffer&, cl::Buffer&, const cl::Buffer&, const unsigned int> > prolongate_vector_k;
|
||||
@ -117,12 +115,12 @@ public:
|
||||
|
||||
static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);
|
||||
|
||||
static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
|
||||
static double norm(cl::Buffer& in, cl::Buffer& out, int N);
|
||||
static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N);
|
||||
static void scale(cl::Buffer& in, const double a, int N);
|
||||
static void vmul(const double alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
|
||||
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
|
||||
static Scalar dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
|
||||
static Scalar norm(cl::Buffer& in, cl::Buffer& out, int N);
|
||||
static void axpy(cl::Buffer& in, const Scalar a, cl::Buffer& out, int N);
|
||||
static void scale(cl::Buffer& in, const Scalar a, int N);
|
||||
static void vmul(const Scalar alpha, cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
|
||||
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const Scalar omega, const Scalar beta, int N);
|
||||
static void full_to_pressure_restriction(const cl::Buffer& fine_y, cl::Buffer& weights, cl::Buffer& coarse_y, int Nb);
|
||||
static void add_coarse_pressure_correction(cl::Buffer& coarse_x, cl::Buffer& fine_x, int pressure_idx, int Nb);
|
||||
static void prolongate_vector(const cl::Buffer& in, cl::Buffer& out, const cl::Buffer& cols, int N);
|
||||
@ -150,7 +148,40 @@ public:
|
||||
cl::Buffer& invDiagVals, cl::Buffer& invUvals, unsigned int Nb);
|
||||
};
|
||||
|
||||
} // namespace Accelerator
|
||||
} // namespace Opm
|
||||
#if CHOW_PATEL
|
||||
#define DECLARE_ILU(T) \
|
||||
template<> const std::string OpenclKernels<T>::ILU_apply1_str; \
|
||||
template<> const std::string OpenclKernels<T>::ILU_apply2_str;
|
||||
#else
|
||||
#define DECLARE_ILU(T) \
|
||||
template<> const std::string OpenclKernels<T>::ILU_apply1_fm_str; \
|
||||
template<> const std::string OpenclKernels<T>::ILU_apply2_fm_str;
|
||||
#endif
|
||||
|
||||
#define DECLARE_INSTANCE(T) \
|
||||
DECLARE_ILU(T) \
|
||||
template<> const std::string OpenclKernels<T>::axpy_str; \
|
||||
template<> const std::string OpenclKernels<T>::scale_str; \
|
||||
template<> const std::string OpenclKernels<T>::vmul_str; \
|
||||
template<> const std::string OpenclKernels<T>::dot_1_str; \
|
||||
template<> const std::string OpenclKernels<T>::norm_str; \
|
||||
template<> const std::string OpenclKernels<T>::custom_str; \
|
||||
template<> const std::string OpenclKernels<T>::full_to_pressure_restriction_str; \
|
||||
template<> const std::string OpenclKernels<T>::add_coarse_pressure_correction_str; \
|
||||
template<> const std::string OpenclKernels<T>::prolongate_vector_str; \
|
||||
template<> const std::string OpenclKernels<T>::spmv_blocked_str; \
|
||||
template<> const std::string OpenclKernels<T>::spmv_blocked_add_str; \
|
||||
template<> const std::string OpenclKernels<T>::spmv_str; \
|
||||
template<> const std::string OpenclKernels<T>::spmv_noreset_str; \
|
||||
template<> const std::string OpenclKernels<T>::residual_blocked_str; \
|
||||
template<> const std::string OpenclKernels<T>::residual_str; \
|
||||
template<> const std::string OpenclKernels<T>::stdwell_apply_str; \
|
||||
template<> const std::string OpenclKernels<T>::ILU_decomp_str; \
|
||||
template<> const std::string OpenclKernels<T>::isaiL_str; \
|
||||
template<> const std::string OpenclKernels<T>::isaiU_str;
|
||||
|
||||
DECLARE_INSTANCE(double)
|
||||
|
||||
} // namespace Opm::Accelerator
|
||||
|
||||
#endif
|
||||
|
@ -203,7 +203,7 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
|
||||
context = std::make_shared<cl::Context>(devices[0]);
|
||||
queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));
|
||||
|
||||
OpenclKernels::init(context.get(), queue.get(), devices, verbosity);
|
||||
OpenclKernels<double>::init(context.get(), queue.get(), devices, verbosity);
|
||||
|
||||
} catch (const cl::Error& error) {
|
||||
std::ostringstream oss;
|
||||
@ -263,7 +263,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
|
||||
}
|
||||
|
||||
norm = OpenclKernels::norm(d_r, d_tmp, N);
|
||||
norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
|
||||
norm_0 = norm;
|
||||
|
||||
if (verbosity > 1) {
|
||||
@ -277,11 +277,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
}
|
||||
for (it = 0.5; it < maxit; it += 0.5) {
|
||||
rhop = rho;
|
||||
rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N);
|
||||
rho = OpenclKernels<double>::dot(d_rw, d_r, d_tmp, N);
|
||||
|
||||
if (it > 1) {
|
||||
beta = (rho / rhop) * (alpha / omega);
|
||||
OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N);
|
||||
OpenclKernels<double>::custom(d_p, d_v, d_r, omega, beta, N);
|
||||
}
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
@ -298,7 +298,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
}
|
||||
|
||||
// v = A * pw
|
||||
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
|
||||
OpenclKernels<double>::spmv(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
t_spmv.stop();
|
||||
@ -315,11 +315,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
t_rest.start();
|
||||
}
|
||||
|
||||
tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N);
|
||||
tmp1 = OpenclKernels<double>::dot(d_rw, d_v, d_tmp, N);
|
||||
alpha = rho / tmp1;
|
||||
OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
|
||||
OpenclKernels::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
|
||||
norm = OpenclKernels::norm(d_r, d_tmp, N);
|
||||
OpenclKernels<double>::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
|
||||
OpenclKernels<double>::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
|
||||
norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
t_rest.stop();
|
||||
@ -343,7 +343,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
}
|
||||
|
||||
// t = A * s
|
||||
OpenclKernels::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
|
||||
OpenclKernels<double>::spmv(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
|
||||
if(verbosity >= 3){
|
||||
queue->finish();
|
||||
t_spmv.stop();
|
||||
@ -360,12 +360,12 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
t_rest.start();
|
||||
}
|
||||
|
||||
tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N);
|
||||
tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N);
|
||||
tmp1 = OpenclKernels<double>::dot(d_t, d_r, d_tmp, N);
|
||||
tmp2 = OpenclKernels<double>::dot(d_t, d_t, d_tmp, N);
|
||||
omega = tmp1 / tmp2;
|
||||
OpenclKernels::axpy(d_s, omega, d_x, N); // x = x + omega * s
|
||||
OpenclKernels::axpy(d_t, -omega, d_r, N); // r = r - omega * t
|
||||
norm = OpenclKernels::norm(d_r, d_tmp, N);
|
||||
OpenclKernels<double>::axpy(d_s, omega, d_x, N); // x = x + omega * s
|
||||
OpenclKernels<double>::axpy(d_t, -omega, d_r, N); // r = r - omega * t
|
||||
norm = OpenclKernels<double>::norm(d_r, d_tmp, N);
|
||||
if (verbosity >= 3) {
|
||||
queue->finish();
|
||||
t_rest.stop();
|
||||
|
@ -36,9 +36,12 @@ void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue*
|
||||
}
|
||||
|
||||
|
||||
void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y){
|
||||
OpenclKernels::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
|
||||
d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
|
||||
void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y)
|
||||
{
|
||||
OpenclKernels<double>::apply_stdwells(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl,
|
||||
*d_Ccols_ocl, *d_Bcols_ocl,
|
||||
d_x, d_y, dim, dim_wells,
|
||||
*d_val_pointers_ocl, num_std_wells);
|
||||
}
|
||||
|
||||
void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
|
||||
|
Loading…
Reference in New Issue
Block a user