Mirror of https://github.com/OPM/opm-simulators.git
Move opencl kernels to static class

commit 1583292bf4 (parent b8587daaa7)
@@ -241,16 +241,10 @@ BILU0<block_size>::~BILU0()
         for (int color = 0; color < numColors; ++color) {
             const unsigned int firstRow = rowsPerColorPrefix[color];
             const unsigned int lastRow = rowsPerColorPrefix[color+1];
-            const unsigned int work_group_size2 = 128;
-            const unsigned int num_work_groups2 = 1024;
-            const unsigned int total_work_items2 = num_work_groups2 * work_group_size2;
-            const unsigned int num_hwarps_per_group = work_group_size2 / 16;
-            const unsigned int lmem_per_work_group2 = num_hwarps_per_group * bs * bs * sizeof(double); // each block needs a pivot
             if (verbosity >= 4) {
                 out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
             }
-            event = (*ilu_decomp)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2));
-            event.wait();
+            OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
         }

         if (verbosity >= 3) {
@@ -266,7 +260,7 @@ BILU0<block_size>::~BILU0()
 // however, if individual kernel calls are timed, waiting for events is needed
 // behavior on other GPUs is untested
 template <unsigned int block_size>
-void BILU0<block_size>::apply(cl::Buffer& y, cl::Buffer& x)
+void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
 {
     const double relaxation = 0.9;
     cl::Event event;
@@ -274,28 +268,24 @@ BILU0<block_size>::~BILU0()

     for(int color = 0; color < numColors; ++color){
#if CHOW_PATEL
-        event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+        OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
#else
-        event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+        OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
#endif
         // event.wait();
     }

     for(int color = numColors-1; color >= 0; --color){
#if CHOW_PATEL
-        event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+        OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
#else
-        event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+        OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
#endif
         // event.wait();
     }

     // apply relaxation
-    event = (*scale)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), x, relaxation, N);
-    event.wait();
+    OpenclKernels::scale(x, relaxation, N);

     if (verbosity >= 4) {
         event.wait();
         std::ostringstream out;
         out << "BILU0 apply: " << t_apply.stop() << " s";
         OpmLog::info(out.str());
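The apply() loops above are a level-scheduled triangular solve: rows that share a color have no dependencies on each other, so each color is one kernel launch and the colors run in sequence. A minimal CPU sketch of the forward pass, under that assumption (rowsOfColor and solveLowerRow are illustrative helpers, not names from this commit):

// forward substitution, one color at a time; within a color every row is
// independent, which is what lets ILU_apply1 map rows to GPU work-items
for (int color = 0; color < numColors; ++color) {
    for (int row : rowsOfColor(color)) { // hypothetical helper
        solveLowerRow(row);              // x[row] from y and earlier colors only
    }
}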
@@ -311,41 +301,17 @@ template <unsigned int block_size>
 void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_) {
     this->queue = queue_;
 }
-template <unsigned int block_size>
-void BILU0<block_size>::setKernelParameters(const unsigned int work_group_size_, const unsigned int total_work_items_, const unsigned int lmem_per_work_group_) {
-    this->work_group_size = work_group_size_;
-    this->total_work_items = total_work_items_;
-    this->lmem_per_work_group = lmem_per_work_group_;
-}
-template <unsigned int block_size>
-void BILU0<block_size>::setKernels(
-    cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
-    cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_,
-    cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale_,
-    cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_
-){
-    this->ILU_apply1 = ILU_apply1_;
-    this->ILU_apply2 = ILU_apply2_;
-    this->scale = scale_;
-    this->ilu_decomp = ilu_decomp_;
-}


-#define INSTANTIATE_BDA_FUNCTIONS(n) \
-template BILU0<n>::BILU0(ILUReorder, int); \
-template BILU0<n>::~BILU0(); \
-template bool BILU0<n>::init(BlockedMatrix<n>*); \
-template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
-template void BILU0<n>::apply(cl::Buffer& x, cl::Buffer& y); \
-template void BILU0<n>::setOpenCLContext(cl::Context*); \
-template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*); \
-template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned int); \
-template void BILU0<n>::setKernels( \
-    cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
-    cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
-    cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *, \
-    cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> * \
-    );
+#define INSTANTIATE_BDA_FUNCTIONS(n) \
+template BILU0<n>::BILU0(ILUReorder, int); \
+template BILU0<n>::~BILU0(); \
+template bool BILU0<n>::init(BlockedMatrix<n>*); \
+template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
+template void BILU0<n>::apply(const cl::Buffer&, cl::Buffer&); \
+template void BILU0<n>::setOpenCLContext(cl::Context*); \
+template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*);


 INSTANTIATE_BDA_FUNCTIONS(1);
 INSTANTIATE_BDA_FUNCTIONS(2);
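INSTANTIATE_BDA_FUNCTIONS exists because the template member definitions live in this .cpp file: explicit instantiation generates the code for each supported block size so other translation units can link against it. As a sketch, INSTANTIATE_BDA_FUNCTIONS(2) expands to lines such as:

template BILU0<2>::BILU0(ILUReorder, int);
template void BILU0<2>::apply(const cl::Buffer&, cl::Buffer&);
// ... one explicit instantiation per member listed in the macro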
@@ -72,21 +72,11 @@ namespace bda
#endif
     } GPU_storage;

-    ilu_apply1_kernel_type *ILU_apply1;
-    ilu_apply2_kernel_type *ILU_apply2;
-    cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale;
-    cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
-                      cl::Buffer&, cl::Buffer&,
-                      const int, cl::LocalSpaceArg> *ilu_decomp;
-
     GPU_storage s;
     cl::Context *context;
     cl::CommandQueue *queue;
     std::vector<cl::Event> events;
     cl_int err;
-    int work_group_size = 0;
-    int total_work_items = 0;
-    int lmem_per_work_group = 0;

#if CHOW_PATEL
     ChowPatelIlu<block_size> chowPatelIlu;
@@ -105,17 +95,10 @@ namespace bda
     bool create_preconditioner(BlockedMatrix<block_size> *mat);

     // apply preconditioner, x = prec(y)
-    void apply(cl::Buffer& y, cl::Buffer& x);
+    void apply(const cl::Buffer& y, cl::Buffer& x);

     void setOpenCLContext(cl::Context *context);
     void setOpenCLQueue(cl::CommandQueue *queue);
-    void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
-    void setKernels(
-        cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
-        cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2,
-        cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale,
-        cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp
-    );

     int* getToOrder()
     {
@@ -18,12 +18,300 @@
 */

 #include <config.h>
+#include <cmath>
+#include <sstream>
+
+#include <opm/common/OpmLog/OpmLog.hpp>
+#include <dune/common/timer.hh>

 #include <opm/simulators/linalg/bda/openclKernels.hpp>
+#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp> // defines CHOW_PATEL

 namespace bda
 {

-std::string get_axpy_string() {
+using Opm::OpmLog;
+using Dune::Timer;
+
+// define static variables and kernels
+int OpenclKernels::verbosity;
+cl::CommandQueue *OpenclKernels::queue;
+std::vector<double> OpenclKernels::tmp;
+bool OpenclKernels::initialized = false;
+
+std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k;
+std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
+std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k;
+std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k;
+std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
+std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_blocked_k;
+std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
+std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
+std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
+std::unique_ptr<stdwell_apply_no_reorder_kernel_type> OpenclKernels::stdwell_apply_no_reorder_k;
+std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k;
+
+
+// divide A by B, and round up: return (int)ceil(A/B)
+unsigned int ceilDivision(const unsigned int A, const unsigned int B)
+{
+    return A / B + (A % B > 0);
+}
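ceilDivision rounds the quotient up without floating point; it is what sizes the ND-range so total_work_items covers every row. A worked check with illustrative values (the kernels are expected to guard the excess work-items with an i < N test):

// 1000 rows with 256 work-items per group need 4 groups (1024 work-items)
static_assert(1000 / 256 + (1000 % 256 > 0) == 4, "ceilDivision(1000, 256) == 4");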
+void add_kernel_string(cl::Program::Sources &sources, std::string &source) {
+    sources.emplace_back(source);
+}
+
+void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_) {
+
+    if (initialized) {
+        OpmLog::warning("Warning OpenclKernels is already initialized");
+        return;
+    }
+
+    queue = queue_;
+    verbosity = verbosity_;
+
+    cl::Program::Sources sources;
+    std::string axpy_s = get_axpy_string();
+    add_kernel_string(sources, axpy_s);
+    std::string scale_s = get_scale_string();
+    add_kernel_string(sources, scale_s);
+    std::string dot_1_s = get_dot_1_string();
+    add_kernel_string(sources, dot_1_s);
+    std::string norm_s = get_norm_string();
+    add_kernel_string(sources, norm_s);
+    std::string custom_s = get_custom_string();
+    add_kernel_string(sources, custom_s);
+    std::string spmv_blocked_s = get_spmv_blocked_string();
+    add_kernel_string(sources, spmv_blocked_s);
+#if CHOW_PATEL
+    bool ilu_operate_on_full_matrix = false;
+#else
+    bool ilu_operate_on_full_matrix = true;
+#endif
+    std::string ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
+    add_kernel_string(sources, ILU_apply1_s);
+    std::string ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
+    add_kernel_string(sources, ILU_apply2_s);
+    std::string stdwell_apply_s = get_stdwell_apply_string(true);
+    add_kernel_string(sources, stdwell_apply_s);
+    std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
+    add_kernel_string(sources, stdwell_apply_no_reorder_s);
+    std::string ilu_decomp_s = get_ilu_decomp_string();
+    add_kernel_string(sources, ilu_decomp_s);
+
+    cl::Program program = cl::Program(*context, sources);
+    program.build(devices);
+
+    // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
+    // cl::KernelFunctor<> myKernel(); myKernel(args, arg1, arg2); is also blocking
+
+    // actually creating the kernels
+    dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
+    norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
+    axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
+    scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
+    custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
+    spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
+    ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
+    ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
+    stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
+    stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
+    ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
+
+    initialized = true;
+} // end get_opencl_kernels()
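After this one-time init(), callers no longer build programs or hold kernel functors themselves; any component can invoke the static wrappers directly. A minimal usage sketch, reusing buffer names that appear elsewhere in this commit:

// once, while setting up the backend:
OpenclKernels::init(context.get(), queue.get(), devices, verbosity);

// afterwards, from solver or preconditioner code alike:
double rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N);
OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v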


+double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
+{
+    const unsigned int work_group_size = 256;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
+    Timer t_dot;
+    tmp.resize(num_work_groups);
+
+    cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
+
+    queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
+
+    double gpu_sum = 0.0;
+    for (unsigned int i = 0; i < num_work_groups; ++i) {
+        gpu_sum += tmp[i];
+    }
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels dot() time: " << t_dot.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+
+    return gpu_sum;
+}
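dot() is a two-stage reduction: the kernel leaves one partial sum per work-group in out, and the host reads them back and adds them up. A minimal sketch of the work-group reduction that get_dot_1_string() is expected to generate (the actual kernel text lives in that function; this version is illustrative only):

const char* dot_sketch = R"(
__kernel void dot_1(
    __global double *in1,
    __global double *in2,
    __global double *out,
    const unsigned int N,
    __local double *scratch)
{
    unsigned int lid = get_local_id(0);
    unsigned int gid = get_global_id(0);

    // each work-item accumulates a strided partial product
    double sum = 0.0;
    for (unsigned int i = gid; i < N; i += get_global_size(0)) {
        sum += in1[i] * in2[i];
    }
    scratch[lid] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    // tree reduction in local memory
    for (unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1) {
        if (lid < s) {
            scratch[lid] += scratch[lid + s];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // one partial sum per work-group; the host adds them up
    if (lid == 0) {
        out[get_group_id(0)] = scratch[0];
    }
}
)";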
+double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
+{
+    const unsigned int work_group_size = 256;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
+    Timer t_norm;
+    tmp.resize(num_work_groups);
+
+    cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
+
+    queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
+
+    double gpu_norm = 0.0;
+    for (unsigned int i = 0; i < num_work_groups; ++i) {
+        gpu_norm += tmp[i];
+    }
+    gpu_norm = sqrt(gpu_norm);
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels norm() time: " << t_norm.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+
+    return gpu_norm;
+}
+
+void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    Timer t_axpy;
+
+    cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N);
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels axpy() time: " << t_axpy.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    Timer t_scale;
+
+    cl::Event event = (*scale_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, N);
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels scale() time: " << t_scale.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    Timer t_custom;
+
+    cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N);
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels custom() time: " << t_custom.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
+    Timer t_spmv;
+
+    cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels spmv_blocked() time: " << t_spmv.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
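spmv_blocked() computes b = A * x for a BSR matrix: Nb block rows, each a run of dense block_size-by-block_size blocks addressed through rows/cols. A hedged CPU reference of the same arithmetic (row-major blocks assumed; the GPU kernel follows the Eberhardt & Hoemmen scheme rather than this loop nest):

// reference BSR sparse matrix-vector product: b = A * x
void bsr_spmv_reference(const double *vals, const int *cols, const int *rows,
                        const double *x, double *b, int Nb, int bs)
{
    for (int br = 0; br < Nb; ++br) {                    // block row
        for (int r = 0; r < bs; ++r) {
            b[br * bs + r] = 0.0;
        }
        for (int blk = rows[br]; blk < rows[br + 1]; ++blk) {
            const double *block = vals + blk * bs * bs;  // dense bs x bs block
            const double *xcol  = x + cols[blk] * bs;    // matching x segment
            for (int r = 0; r < bs; ++r) {
                double sum = 0.0;
                for (int c = 0; c < bs; ++c) {
                    sum += block[r * bs + c] * xcol[c];
                }
                b[br * bs + r] += sum;
            }
        }
    }
}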

+void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
+    Timer t_ilu_apply1;
+
+    cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, diagIndex, y, x, rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+
+    if (verbosity >= 5) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels ILU_apply1() time: " << t_ilu_apply1.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
+{
+    const unsigned int work_group_size = 32;
+    const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
+    const unsigned int total_work_items = num_work_groups * work_group_size;
+    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
+    Timer t_ilu_apply2;
+
+    cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, diagIndex, invDiagVals, x, rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
+
+    if (verbosity >= 5) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels ILU_apply2() time: " << t_ilu_apply2.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
+
+void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size)
+{
+    const unsigned int work_group_size2 = 128;
+    const unsigned int num_work_groups2 = 1024;
+    const unsigned int total_work_items2 = num_work_groups2 * work_group_size2;
+    const unsigned int num_hwarps_per_group = work_group_size2 / 16;
+    const unsigned int lmem_per_work_group2 = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot
+    Timer t_ilu_decomp;
+
+    cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, vals, cols, rows, invDiagVals, diagIndex, Nb, cl::Local(lmem_per_work_group2));
+
+    if (verbosity >= 4) {
+        event.wait();
+        std::ostringstream oss;
+        oss << std::scientific << "OpenclKernels ILU_decomp() time: " << t_ilu_decomp.stop() << " s";
+        OpmLog::info(oss.str());
+    }
+}
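The local-memory size in ILU_decomp follows from the launch geometry: 128 work-items per group, 16 per hardware half-warp, and one block_size-squared pivot block per half-warp. Worked out for 3x3 blocks (arithmetic only; the quantities come from the code above, the block size of 3 is just an example):

constexpr unsigned int work_group_size2 = 128;
constexpr unsigned int num_hwarps_per_group = work_group_size2 / 16; // 8 half-warps
constexpr unsigned int bs = 3;                                       // e.g. 3x3 blocks
static_assert(num_hwarps_per_group * bs * bs * sizeof(double) == 576,
              "576 bytes of local memory per work-group");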


+std::string OpenclKernels::get_axpy_string() {
     return R"(
     __kernel void axpy(
         __global double *in,
@@ -44,7 +332,7 @@ namespace bda


 // scale vector with scalar
-std::string get_scale_string() {
+std::string OpenclKernels::get_scale_string() {
     return R"(
     __kernel void scale(
         __global double *vec,
@@ -64,7 +352,7 @@ namespace bda


 // returns partial sums, instead of the final dot product
-std::string get_dot_1_string() {
+std::string OpenclKernels::get_dot_1_string() {
     return R"(
     __kernel void dot_1(
         __global double *in1,
@@ -107,7 +395,7 @@ namespace bda

 // returns partial sums, instead of the final norm
 // the square root must be computed on CPU
-std::string get_norm_string() {
+std::string OpenclKernels::get_norm_string() {
     return R"(
     __kernel void norm(
         __global double *in,
@@ -148,7 +436,7 @@ namespace bda


 // p = (p - omega * v) * beta + r
-std::string get_custom_string() {
+std::string OpenclKernels::get_custom_string() {
     return R"(
     __kernel void custom(
         __global double *p,
@@ -173,7 +461,7 @@ namespace bda
     )";
 }

-std::string get_spmv_blocked_string() {
+std::string OpenclKernels::get_spmv_blocked_string() {
     return R"(
     __kernel void spmv_blocked(
         __global const double *vals,
@@ -243,7 +531,7 @@ namespace bda



-std::string get_ILU_apply1_string(bool full_matrix) {
+std::string OpenclKernels::get_ILU_apply1_string(bool full_matrix) {
     std::string s = R"(
     __kernel void ILU_apply1(
         __global const double *LUvals,
@@ -319,7 +607,7 @@ namespace bda
 }


-std::string get_ILU_apply2_string(bool full_matrix) {
+std::string OpenclKernels::get_ILU_apply2_string(bool full_matrix) {
     std::string s = R"(
     __kernel void ILU_apply2(
         __global const double *LUvals,
@@ -402,7 +690,7 @@ namespace bda
     return s;
 }

-std::string get_stdwell_apply_string(bool reorder) {
+std::string OpenclKernels::get_stdwell_apply_string(bool reorder) {
     std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
     std::string s = "__kernel void " + kernel_name + R"((
         __global const double *Cnnzs,
@@ -497,7 +785,7 @@ namespace bda
 }


-std::string get_ilu_decomp_string() {
+std::string OpenclKernels::get_ilu_decomp_string() {
     return R"(

     // a = a - (b * c)
@@ -21,6 +21,7 @@
#define OPENCL_HPP

 #include <string>
+#include <memory>

 #include <opm/simulators/linalg/bda/opencl.hpp>

@@ -29,7 +30,7 @@ namespace bda

 using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
                                            cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
-using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
+using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
                                                  cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
 using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                  cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
@@ -44,56 +45,93 @@ using stdwell_apply_no_reorder_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::
 using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&,
                                                  cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg>;

+class OpenclKernels
+{
+private:
+    static int verbosity;
+    static cl::CommandQueue *queue;
+    static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
+    static bool initialized;
+
+    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
+    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
+    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
+    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
+    static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
+    static std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
+    static std::unique_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
+    static std::unique_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
+    static std::unique_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
+    static std::unique_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
+    static std::unique_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
+
     /// Generate string with axpy kernel
     /// a = a + alpha * b
-    std::string get_axpy_string();
+    static std::string get_axpy_string();

     /// Generate string with scale kernel
     /// a = a * alpha
-    std::string get_scale_string();
+    static std::string get_scale_string();

     /// returns partial sums, instead of the final dot product
     /// partial sums are added on CPU
-    std::string get_dot_1_string();
+    static std::string get_dot_1_string();

     /// returns partial sums, instead of the final norm
     /// the square root must be computed on CPU
-    std::string get_norm_string();
+    static std::string get_norm_string();

     /// Generate string with custom kernel
     /// This kernel combines some ilubicgstab vector operations into 1
     /// p = (p - omega * v) * beta + r
-    std::string get_custom_string();
+    static std::string get_custom_string();

     /// b = mat * x
     /// algorithm based on:
     /// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-Memory Parallel Architectures,
     /// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
-    std::string get_spmv_blocked_string();
+    static std::string get_spmv_blocked_string();

     /// ILU apply part 1: forward substitution
     /// solves L*x=y where L is a lower triangular sparse blocked matrix
     /// this L can be it's own BSR matrix (if full_matrix is false),
     /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of L end
     /// \param[in] full_matrix   whether the kernel should operate on a full (square) matrix or not
-    std::string get_ILU_apply1_string(bool full_matrix);
+    static std::string get_ILU_apply1_string(bool full_matrix);

     /// ILU apply part 2: backward substitution
     /// solves U*x=y where U is an upper triangular sparse blocked matrix
     /// this U can be it's own BSR matrix (if full_matrix is false),
     /// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of U start
     /// \param[in] full_matrix   whether the kernel should operate on a full (square) matrix or not
-    std::string get_ILU_apply2_string(bool full_matrix);
+    static std::string get_ILU_apply2_string(bool full_matrix);

     /// Generate string with the stdwell_apply kernels
     /// If reorder is true, the B/Ccols do not correspond with the x/y vector
     /// the x/y vector is reordered, use toOrder to address that
     /// \param[in] reorder   whether the matrix is reordered or not
-    std::string get_stdwell_apply_string(bool reorder);
+    static std::string get_stdwell_apply_string(bool reorder);

     /// Generate string with the exact ilu decomposition kernel
     /// The kernel takes a full BSR matrix and performs inplace ILU decomposition
-    std::string get_ilu_decomp_string();
+    static std::string get_ilu_decomp_string();

+    OpenclKernels(){}; // disable instantiation
+
+public:
+    static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);
+
+    static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
+    static double norm(cl::Buffer& in, cl::Buffer& out, int N);
+    static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N);
+    static void scale(cl::Buffer& in, const double a, int N);
+    static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
+    static void spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size);
+    static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
+    static void ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
+
+    static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size);
+};

 } // end namespace bda

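With every member static and the default constructor private, OpenclKernels behaves as a namespace with state: one compiled cl::Program shared by solver and preconditioner, guarded by the initialized flag so a second init() only logs a warning and returns. A sketch of what the class does and does not allow (ctx and q stand in for the real context and queue pointers):

OpenclKernels::init(ctx, q, devices, verbosity); // first call compiles all kernels
OpenclKernels::init(ctx, q, devices, verbosity); // second call warns and returns
// OpenclKernels k; // does not compile: the constructor is private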
@@ -174,6 +174,8 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
         context = std::make_shared<cl::Context>(devices[0]);
         queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));

+        OpenclKernels::init(context.get(), queue.get(), devices, verbosity);
+
     } catch (const cl::Error& error) {
         std::ostringstream oss;
         oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@@ -193,125 +195,6 @@ openclSolverBackend<block_size>::~openclSolverBackend() {
 }


-// divide A by B, and round up: return (int)ceil(A/B)
-template <unsigned int block_size>
-unsigned int openclSolverBackend<block_size>::ceilDivision(const unsigned int A, const unsigned int B)
-{
-    return A / B + (A % B > 0);
-}
-
-
-template <unsigned int block_size>
-double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
-{
-    const unsigned int work_group_size = 256;
-    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
-    const unsigned int total_work_items = num_work_groups * work_group_size;
-    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
-    Timer t_dot;
-
-    cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
-
-    queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
-
-    double gpu_sum = 0.0;
-    for (unsigned int i = 0; i < num_work_groups; ++i) {
-        gpu_sum += tmp[i];
-    }
-
-    if (verbosity >= 4) {
-        event.wait();
-        std::ostringstream oss;
-        oss << std::scientific << "openclSolver dot_w time: " << t_dot.stop() << " s";
-        OpmLog::info(oss.str());
-    }
-
-    return gpu_sum;
-}
-
-template <unsigned int block_size>
-double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
-{
-    const unsigned int work_group_size = 256;
-    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
-    const unsigned int total_work_items = num_work_groups * work_group_size;
-    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
-    Timer t_norm;
-
-    cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
-
-    queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
-
-    double gpu_norm = 0.0;
-    for (unsigned int i = 0; i < num_work_groups; ++i) {
-        gpu_norm += tmp[i];
-    }
-    gpu_norm = sqrt(gpu_norm);
-
-    if (verbosity >= 4) {
-        event.wait();
-        std::ostringstream oss;
-        oss << std::scientific << "openclSolver norm_w time: " << t_norm.stop() << " s";
-        OpmLog::info(oss.str());
-    }
-
-    return gpu_norm;
-}
-
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
-{
-    const unsigned int work_group_size = 32;
-    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
-    const unsigned int total_work_items = num_work_groups * work_group_size;
-    Timer t_axpy;
-
-    cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N);
-
-    if (verbosity >= 4) {
-        event.wait();
-        std::ostringstream oss;
-        oss << std::scientific << "openclSolver axpy_w time: " << t_axpy.stop() << " s";
-        OpmLog::info(oss.str());
-    }
-}
-
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta)
-{
-    const unsigned int work_group_size = 32;
-    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
-    const unsigned int total_work_items = num_work_groups * work_group_size;
-    Timer t_custom;
-
-    cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N);
-
-    if (verbosity >= 4) {
-        event.wait();
-        std::ostringstream oss;
-        oss << std::scientific << "openclSolver custom_w time: " << t_custom.stop() << " s";
-        OpmLog::info(oss.str());
-    }
-}
-
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
-{
-    const unsigned int work_group_size = 32;
-    const unsigned int num_work_groups = ceilDivision(N, work_group_size);
-    const unsigned int total_work_items = num_work_groups * work_group_size;
-    const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
-    Timer t_spmv;
-
-    cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
-
-    if (verbosity >= 4) {
-        event.wait();
-        std::ostringstream oss;
-        oss << std::scientific << "openclSolver spmv_blocked_w time: " << t_spmv.stop() << " s";
-        OpmLog::info(oss.str());
-    }
-}

 template <unsigned int block_size>
 void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
@@ -320,7 +203,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
     double norm, norm_0;

     if(wellContribs.getNumWells() > 0){
-        wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get());
+        // wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get());
     }

     Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
@@ -348,7 +231,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
         OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
     }

-    norm = norm_w(d_r, d_tmp);
+    norm = OpenclKernels::norm(d_r, d_tmp, N);
     norm_0 = norm;

     if (verbosity > 1) {
@@ -360,11 +243,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
     t_rest.start();
     for (it = 0.5; it < maxit; it += 0.5) {
         rhop = rho;
-        rho = dot_w(d_rw, d_r, d_tmp);
+        rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N);

         if (it > 1) {
             beta = (rho / rhop) * (alpha / omega);
-            custom_w(d_p, d_v, d_r, omega, beta);
+            OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N);
         }
         t_rest.stop();

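OpenclKernels::custom is the fused BiCGSTAB search-direction update, p = (p - omega * v) * beta + r, done in one kernel launch instead of three separate vector operations. A hedged element-wise CPU reference of the same arithmetic:

// what custom() computes, per element
for (int i = 0; i < N; ++i) {
    p[i] = (p[i] - omega * v[i]) * beta + r[i];
}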
@@ -375,7 +258,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr

         // v = A * pw
         t_spmv.start();
-        spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v);
+        OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
         t_spmv.stop();

         // apply wellContributions
@@ -386,11 +269,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
         t_well.stop();

         t_rest.start();
-        tmp1 = dot_w(d_rw, d_v, d_tmp);
+        tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N);
         alpha = rho / tmp1;
-        axpy_w(d_v, -alpha, d_r);      // r = r - alpha * v
-        axpy_w(d_pw, alpha, d_x);      // x = x + alpha * pw
-        norm = norm_w(d_r, d_tmp);
+        OpenclKernels::axpy(d_v, -alpha, d_r, N);      // r = r - alpha * v
+        OpenclKernels::axpy(d_pw, alpha, d_x, N);      // x = x + alpha * pw
+        norm = OpenclKernels::norm(d_r, d_tmp, N);
         t_rest.stop();

         if (norm < tolerance * norm_0) {
@@ -406,7 +289,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr

         // t = A * s
         t_spmv.start();
-        spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_s, d_t);
+        OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
         t_spmv.stop();

         // apply wellContributions
@@ -417,12 +300,12 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
         t_well.stop();

         t_rest.start();
-        tmp1 = dot_w(d_t, d_r, d_tmp);
-        tmp2 = dot_w(d_t, d_t, d_tmp);
+        tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N);
+        tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N);
         omega = tmp1 / tmp2;
-        axpy_w(d_s, omega, d_x);       // x = x + omega * s
-        axpy_w(d_t, -omega, d_r);      // r = r - omega * t
-        norm = norm_w(d_r, d_tmp);
+        OpenclKernels::axpy(d_s, omega, d_x, N);       // x = x + omega * s
+        OpenclKernels::axpy(d_t, -omega, d_r, N);      // r = r - omega * t
+        norm = OpenclKernels::norm(d_r, d_tmp, N);
         t_rest.stop();

         if (norm < tolerance * norm_0) {
@@ -479,7 +362,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
         prec->setOpenCLContext(context.get());
         prec->setOpenCLQueue(queue.get());

-        tmp = new double[N];
#if COPY_ROW_BY_ROW
         vals_contiguous = new double[N];
#endif
@@ -507,10 +389,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
             d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
         }

-        get_opencl_kernels();
-
-        prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), scale_k.get(), ilu_decomp_k.get());
-
     } catch (const cl::Error& error) {
         std::ostringstream oss;
         oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@@ -525,68 +403,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
     initialized = true;
 } // end initialize()

-void add_kernel_string(cl::Program::Sources &sources, std::string &source) {
-    sources.emplace_back(source);
-}
-
-template <unsigned int block_size>
-void openclSolverBackend<block_size>::get_opencl_kernels() {
-
-    cl::Program::Sources sources;
-    std::string axpy_s = get_axpy_string();
-    add_kernel_string(sources, axpy_s);
-    std::string scale_s = get_scale_string();
-    add_kernel_string(sources, scale_s);
-    std::string dot_1_s = get_dot_1_string();
-    add_kernel_string(sources, dot_1_s);
-    std::string norm_s = get_norm_string();
-    add_kernel_string(sources, norm_s);
-    std::string custom_s = get_custom_string();
-    add_kernel_string(sources, custom_s);
-    std::string spmv_blocked_s = get_spmv_blocked_string();
-    add_kernel_string(sources, spmv_blocked_s);
-#if CHOW_PATEL
-    bool ilu_operate_on_full_matrix = false;
-#else
-    bool ilu_operate_on_full_matrix = true;
-#endif
-    std::string ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
-    add_kernel_string(sources, ILU_apply1_s);
-    std::string ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
-    add_kernel_string(sources, ILU_apply2_s);
-    std::string stdwell_apply_s = get_stdwell_apply_string(true);
-    add_kernel_string(sources, stdwell_apply_s);
-    std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
-    add_kernel_string(sources, stdwell_apply_no_reorder_s);
-    std::string ilu_decomp_s = get_ilu_decomp_string();
-    add_kernel_string(sources, ilu_decomp_s);
-
-    cl::Program program = cl::Program(*context, sources);
-    program.build(devices);
-
-    // queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
-    // cl::KernelFunctor<> myKernel(); myKernel(args, arg1, arg2); is also blocking
-
-    // actually creating the kernels
-    dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
-    norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
-    axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
-    scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
-    custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
-    spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
-    ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
-    ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
-    stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
-    stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
-    ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
-} // end get_opencl_kernels()

 template <unsigned int block_size>
 void openclSolverBackend<block_size>::finalize() {
     if (opencl_ilu_reorder != ILUReorder::NONE) {
         delete[] rb;
     }
-    delete[] tmp;
#if COPY_ROW_BY_ROW
     delete[] vals_contiguous;
#endif
@@ -672,11 +494,6 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
     Timer t;

     bool success = prec->init(mat.get());
-    int work_group_size = 32;
-    int num_work_groups = ceilDivision(N, work_group_size);
-    int total_work_items = num_work_groups * work_group_size;
-    int lmem_per_work_group = work_group_size * sizeof(double);
-    prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group);

     if (opencl_ilu_reorder == ILUReorder::NONE) {
         rmat = mat.get();

@@ -61,21 +61,8 @@ private:
     cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve
     cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm()
     cl::Buffer d_toOrder; // only used when reordering is used
-    double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm()

     // shared pointers are also passed to other objects
     std::vector<cl::Device> devices;
-    std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
-    std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
-    std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
-    std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
-    std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
-    std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
-    std::shared_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
-    std::shared_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
-    std::shared_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
-    std::shared_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
-    std::shared_ptr<ilu_decomp_kernel_type> ilu_decomp_k;

     Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
     int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
@@ -150,9 +137,6 @@ private:
     /// \param[in] cols    array of columnIndices, contains nnz values
     void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);

-    /// Generate and compile opencl kernels
-    void get_opencl_kernels();
-
     /// Clean memory
     void finalize();
