Move opencl kernels to static class

This commit is contained in:
Tong Dong Qiu 2021-11-04 14:25:22 +01:00
parent b8587daaa7
commit 1583292bf4
6 changed files with 381 additions and 305 deletions

View File

@ -241,16 +241,10 @@ BILU0<block_size>::~BILU0()
for (int color = 0; color < numColors; ++color) {
const unsigned int firstRow = rowsPerColorPrefix[color];
const unsigned int lastRow = rowsPerColorPrefix[color+1];
const unsigned int work_group_size2 = 128;
const unsigned int num_work_groups2 = 1024;
const unsigned int total_work_items2 = num_work_groups2 * work_group_size2;
const unsigned int num_hwarps_per_group = work_group_size2 / 16;
const unsigned int lmem_per_work_group2 = num_hwarps_per_group * bs * bs * sizeof(double); // each block needs a pivot
if (verbosity >= 4) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
}
event = (*ilu_decomp)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2));
event.wait();
OpenclKernels::ILU_decomp(firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, Nb, block_size);
}
if (verbosity >= 3) {
@ -266,7 +260,7 @@ BILU0<block_size>::~BILU0()
// however, if individual kernel calls are timed, waiting for events is needed
// behavior on other GPUs is untested
template <unsigned int block_size>
void BILU0<block_size>::apply(cl::Buffer& y, cl::Buffer& x)
void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
{
const double relaxation = 0.9;
cl::Event event;
@ -274,28 +268,24 @@ BILU0<block_size>::~BILU0()
for(int color = 0; color < numColors; ++color){
#if CHOW_PATEL
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
#else
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, Nb, block_size);
#endif
// event.wait();
}
for(int color = numColors-1; color >= 0; --color){
#if CHOW_PATEL
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
#else
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, Nb, block_size);
#endif
// event.wait();
}
// apply relaxation
event = (*scale)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), x, relaxation, N);
event.wait();
OpenclKernels::scale(x, relaxation, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream out;
out << "BILU0 apply: " << t_apply.stop() << " s";
OpmLog::info(out.str());
@ -311,41 +301,17 @@ template <unsigned int block_size>
void BILU0<block_size>::setOpenCLQueue(cl::CommandQueue *queue_) {
this->queue = queue_;
}
template <unsigned int block_size>
void BILU0<block_size>::setKernelParameters(const unsigned int work_group_size_, const unsigned int total_work_items_, const unsigned int lmem_per_work_group_) {
this->work_group_size = work_group_size_;
this->total_work_items = total_work_items_;
this->lmem_per_work_group = lmem_per_work_group_;
}
template <unsigned int block_size>
void BILU0<block_size>::setKernels(
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_,
cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale_,
cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_
){
this->ILU_apply1 = ILU_apply1_;
this->ILU_apply2 = ILU_apply2_;
this->scale = scale_;
this->ilu_decomp = ilu_decomp_;
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template BILU0<n>::BILU0(ILUReorder, int); \
template BILU0<n>::~BILU0(); \
template bool BILU0<n>::init(BlockedMatrix<n>*); \
template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
template void BILU0<n>::apply(cl::Buffer& x, cl::Buffer& y); \
template void BILU0<n>::setOpenCLContext(cl::Context*); \
template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*); \
template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned int); \
template void BILU0<n>::setKernels( \
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *, \
cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> * \
);
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template BILU0<n>::BILU0(ILUReorder, int); \
template BILU0<n>::~BILU0(); \
template bool BILU0<n>::init(BlockedMatrix<n>*); \
template bool BILU0<n>::create_preconditioner(BlockedMatrix<n>*); \
template void BILU0<n>::apply(const cl::Buffer&, cl::Buffer&); \
template void BILU0<n>::setOpenCLContext(cl::Context*); \
template void BILU0<n>::setOpenCLQueue(cl::CommandQueue*);
INSTANTIATE_BDA_FUNCTIONS(1);
INSTANTIATE_BDA_FUNCTIONS(2);

View File

@ -72,21 +72,11 @@ namespace bda
#endif
} GPU_storage;
ilu_apply1_kernel_type *ILU_apply1;
ilu_apply2_kernel_type *ILU_apply2;
cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale;
cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg> *ilu_decomp;
GPU_storage s;
cl::Context *context;
cl::CommandQueue *queue;
std::vector<cl::Event> events;
cl_int err;
int work_group_size = 0;
int total_work_items = 0;
int lmem_per_work_group = 0;
#if CHOW_PATEL
ChowPatelIlu<block_size> chowPatelIlu;
@ -105,17 +95,10 @@ namespace bda
bool create_preconditioner(BlockedMatrix<block_size> *mat);
// apply preconditioner, x = prec(y)
void apply(cl::Buffer& y, cl::Buffer& x);
void apply(const cl::Buffer& y, cl::Buffer& x);
void setOpenCLContext(cl::Context *context);
void setOpenCLQueue(cl::CommandQueue *queue);
void setKernelParameters(const unsigned int work_group_size, const unsigned int total_work_items, const unsigned int lmem_per_work_group);
void setKernels(
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2,
cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> *scale,
cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp
);
int* getToOrder()
{

View File

@ -18,12 +18,300 @@
*/
#include <config.h>
#include <cmath>
#include <sstream>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <opm/simulators/linalg/bda/ChowPatelIlu.hpp> // defines CHOW_PATEL
namespace bda
{
std::string get_axpy_string() {
using Opm::OpmLog;
using Dune::Timer;
// define static variables and kernels
int OpenclKernels::verbosity;
cl::CommandQueue *OpenclKernels::queue;
std::vector<double> OpenclKernels::tmp;
bool OpenclKernels::initialized = false;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > OpenclKernels::axpy_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > OpenclKernels::scale_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > OpenclKernels::custom_k;
std::unique_ptr<spmv_kernel_type> OpenclKernels::spmv_blocked_k;
std::unique_ptr<ilu_apply1_kernel_type> OpenclKernels::ILU_apply1_k;
std::unique_ptr<ilu_apply2_kernel_type> OpenclKernels::ILU_apply2_k;
std::unique_ptr<stdwell_apply_kernel_type> OpenclKernels::stdwell_apply_k;
std::unique_ptr<stdwell_apply_no_reorder_kernel_type> OpenclKernels::stdwell_apply_no_reorder_k;
std::unique_ptr<ilu_decomp_kernel_type> OpenclKernels::ilu_decomp_k;
// divide A by B, and round up: return (int)ceil(A/B)
unsigned int ceilDivision(const unsigned int A, const unsigned int B)
{
return A / B + (A % B > 0);
}
void add_kernel_string(cl::Program::Sources &sources, std::string &source) {
sources.emplace_back(source);
}
void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::vector<cl::Device>& devices, int verbosity_) {
if (initialized) {
OpmLog::warning("Warning OpenclKernels is already initialized");
return;
}
queue = queue_;
verbosity = verbosity_;
cl::Program::Sources sources;
std::string axpy_s = get_axpy_string();
add_kernel_string(sources, axpy_s);
std::string scale_s = get_scale_string();
add_kernel_string(sources, scale_s);
std::string dot_1_s = get_dot_1_string();
add_kernel_string(sources, dot_1_s);
std::string norm_s = get_norm_string();
add_kernel_string(sources, norm_s);
std::string custom_s = get_custom_string();
add_kernel_string(sources, custom_s);
std::string spmv_blocked_s = get_spmv_blocked_string();
add_kernel_string(sources, spmv_blocked_s);
#if CHOW_PATEL
bool ilu_operate_on_full_matrix = false;
#else
bool ilu_operate_on_full_matrix = true;
#endif
std::string ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply1_s);
std::string ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply2_s);
std::string stdwell_apply_s = get_stdwell_apply_string(true);
add_kernel_string(sources, stdwell_apply_s);
std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
add_kernel_string(sources, stdwell_apply_no_reorder_s);
std::string ilu_decomp_s = get_ilu_decomp_string();
add_kernel_string(sources, ilu_decomp_s);
cl::Program program = cl::Program(*context, sources);
program.build(devices);
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::KernelFunctor<> myKernel(); myKernel(args, arg1, arg2); is also blocking
// actually creating the kernels
dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
initialized = true;
} // end get_opencl_kernels()
double OpenclKernels::dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_dot;
tmp.resize(num_work_groups);
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
double gpu_sum = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_sum += tmp[i];
}
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels dot() time: " << t_dot.stop() << " s";
OpmLog::info(oss.str());
}
return gpu_sum;
}
double OpenclKernels::norm(cl::Buffer& in, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_norm;
tmp.resize(num_work_groups);
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp.data());
double gpu_norm = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_norm += tmp[i];
}
gpu_norm = sqrt(gpu_norm);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels norm() time: " << t_norm.stop() << " s";
OpmLog::info(oss.str());
}
return gpu_norm;
}
void OpenclKernels::axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_axpy;
cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels axpy() time: " << t_axpy.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::scale(cl::Buffer& in, const double a, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_scale;
cl::Event event = (*scale_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels scale() time: " << t_scale.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_custom;
cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels custom() time: " << t_custom.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_spmv;
cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels spmv_blocked() time: " << t_spmv.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_ilu_apply1;
cl::Event event = (*ILU_apply1_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, diagIndex, y, x, rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels ILU_apply1() time: " << t_ilu_apply1.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_ilu_apply2;
cl::Event event = (*ILU_apply2_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, diagIndex, invDiagVals, x, rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
if (verbosity >= 5) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels ILU_apply2() time: " << t_ilu_apply2.stop() << " s";
OpmLog::info(oss.str());
}
}
void OpenclKernels::ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size)
{
const unsigned int work_group_size2 = 128;
const unsigned int num_work_groups2 = 1024;
const unsigned int total_work_items2 = num_work_groups2 * work_group_size2;
const unsigned int num_hwarps_per_group = work_group_size2 / 16;
const unsigned int lmem_per_work_group2 = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot
Timer t_ilu_decomp;
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, vals, cols, rows, invDiagVals, diagIndex, Nb, cl::Local(lmem_per_work_group2));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "OpenclKernels ILU_decomp() time: " << t_ilu_decomp.stop() << " s";
OpmLog::info(oss.str());
}
}
std::string OpenclKernels::get_axpy_string() {
return R"(
__kernel void axpy(
__global double *in,
@ -44,7 +332,7 @@ namespace bda
// scale vector with scalar
std::string get_scale_string() {
std::string OpenclKernels::get_scale_string() {
return R"(
__kernel void scale(
__global double *vec,
@ -64,7 +352,7 @@ namespace bda
// returns partial sums, instead of the final dot product
std::string get_dot_1_string() {
std::string OpenclKernels::get_dot_1_string() {
return R"(
__kernel void dot_1(
__global double *in1,
@ -107,7 +395,7 @@ namespace bda
// returns partial sums, instead of the final norm
// the square root must be computed on CPU
std::string get_norm_string() {
std::string OpenclKernels::get_norm_string() {
return R"(
__kernel void norm(
__global double *in,
@ -148,7 +436,7 @@ namespace bda
// p = (p - omega * v) * beta + r
std::string get_custom_string() {
std::string OpenclKernels::get_custom_string() {
return R"(
__kernel void custom(
__global double *p,
@ -173,7 +461,7 @@ namespace bda
)";
}
std::string get_spmv_blocked_string() {
std::string OpenclKernels::get_spmv_blocked_string() {
return R"(
__kernel void spmv_blocked(
__global const double *vals,
@ -243,7 +531,7 @@ namespace bda
std::string get_ILU_apply1_string(bool full_matrix) {
std::string OpenclKernels::get_ILU_apply1_string(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply1(
__global const double *LUvals,
@ -319,7 +607,7 @@ namespace bda
}
std::string get_ILU_apply2_string(bool full_matrix) {
std::string OpenclKernels::get_ILU_apply2_string(bool full_matrix) {
std::string s = R"(
__kernel void ILU_apply2(
__global const double *LUvals,
@ -402,7 +690,7 @@ namespace bda
return s;
}
std::string get_stdwell_apply_string(bool reorder) {
std::string OpenclKernels::get_stdwell_apply_string(bool reorder) {
std::string kernel_name = reorder ? "stdwell_apply" : "stdwell_apply_no_reorder";
std::string s = "__kernel void " + kernel_name + R"((
__global const double *Cnnzs,
@ -497,7 +785,7 @@ namespace bda
}
std::string get_ilu_decomp_string() {
std::string OpenclKernels::get_ilu_decomp_string() {
return R"(
// a = a - (b * c)

View File

@ -21,6 +21,7 @@
#define OPENCL_HPP
#include <string>
#include <memory>
#include <opm/simulators/linalg/bda/opencl.hpp>
@ -29,7 +30,7 @@ namespace bda
using spmv_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int,
cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>;
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
using ilu_apply1_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
using ilu_apply2_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>;
@ -44,56 +45,93 @@ using stdwell_apply_no_reorder_kernel_type = cl::KernelFunctor<cl::Buffer&, cl::
using ilu_decomp_kernel_type = cl::KernelFunctor<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg>;
class OpenclKernels
{
private:
static int verbosity;
static cl::CommandQueue *queue;
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
static bool initialized;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
static std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
static std::unique_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
static std::unique_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
static std::unique_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
static std::unique_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
static std::unique_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
/// Generate string with axpy kernel
/// a = a + alpha * b
std::string get_axpy_string();
static std::string get_axpy_string();
/// Generate string with scale kernel
/// a = a * alpha
std::string get_scale_string();
static std::string get_scale_string();
/// returns partial sums, instead of the final dot product
/// partial sums are added on CPU
std::string get_dot_1_string();
static std::string get_dot_1_string();
/// returns partial sums, instead of the final norm
/// the square root must be computed on CPU
std::string get_norm_string();
static std::string get_norm_string();
/// Generate string with custom kernel
/// This kernel combines some ilubicgstab vector operations into 1
/// p = (p - omega * v) * beta + r
std::string get_custom_string();
static std::string get_custom_string();
/// b = mat * x
/// algorithm based on:
/// Optimization of Block Sparse Matrix-Vector Multiplication on Shared-MemoryParallel Architectures,
/// Ryan Eberhardt, Mark Hoemmen, 2016, https://doi.org/10.1109/IPDPSW.2016.42
std::string get_spmv_blocked_string();
static std::string get_spmv_blocked_string();
/// ILU apply part 1: forward substitution
/// solves L*x=y where L is a lower triangular sparse blocked matrix
/// this L can be it's own BSR matrix (if full_matrix is false),
/// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of L end
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
std::string get_ILU_apply1_string(bool full_matrix);
static std::string get_ILU_apply1_string(bool full_matrix);
/// ILU apply part 2: backward substitution
/// solves U*x=y where U is an upper triangular sparse blocked matrix
/// this U can be it's own BSR matrix (if full_matrix is false),
/// or it can be inside a normal, square matrix, in that case diagIndex indicates where the rows of U start
/// \param[in] full_matrix whether the kernel should operate on a full (square) matrix or not
std::string get_ILU_apply2_string(bool full_matrix);
static std::string get_ILU_apply2_string(bool full_matrix);
/// Generate string with the stdwell_apply kernels
/// If reorder is true, the B/Ccols do not correspond with the x/y vector
/// the x/y vector is reordered, use toOrder to address that
/// \param[in] reorder whether the matrix is reordered or not
std::string get_stdwell_apply_string(bool reorder);
static std::string get_stdwell_apply_string(bool reorder);
/// Generate string with the exact ilu decomposition kernel
/// The kernel takes a full BSR matrix and performs inplace ILU decomposition
std::string get_ilu_decomp_string();
static std::string get_ilu_decomp_string();
OpenclKernels(){}; // disable instantiation
public:
static void init(cl::Context *context, cl::CommandQueue *queue, std::vector<cl::Device>& devices, int verbosity);
static double dot(cl::Buffer& in1, cl::Buffer& in2, cl::Buffer& out, int N);
static double norm(cl::Buffer& in, cl::Buffer& out, int N);
static void axpy(cl::Buffer& in, const double a, cl::Buffer& out, int N);
static void scale(cl::Buffer& in, const double a, int N);
static void custom(cl::Buffer& p, cl::Buffer& v, cl::Buffer& r, const double omega, const double beta, int N);
static void spmv_blocked(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& x, cl::Buffer& b, int Nb, unsigned int block_size);
static void ILU_apply1(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, const cl::Buffer& y, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
static void ILU_apply2(cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, cl::Buffer& x, cl::Buffer& rowsPerColor, int color, int Nb, unsigned int block_size);
static void ILU_decomp(int firstRow, int lastRow, cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows, cl::Buffer& diagIndex, cl::Buffer& invDiagVals, int Nb, unsigned int block_size);
};
} // end namespace bda

View File

@ -174,6 +174,8 @@ openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_,
context = std::make_shared<cl::Context>(devices[0]);
queue.reset(new cl::CommandQueue(*context, devices[0], 0, &err));
OpenclKernels::init(context.get(), queue.get(), devices, verbosity);
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -193,125 +195,6 @@ openclSolverBackend<block_size>::~openclSolverBackend() {
}
// divide A by B, and round up: return (int)ceil(A/B)
template <unsigned int block_size>
unsigned int openclSolverBackend<block_size>::ceilDivision(const unsigned int A, const unsigned int B)
{
return A / B + (A % B > 0);
}
template <unsigned int block_size>
double openclSolverBackend<block_size>::dot_w(cl::Buffer in1, cl::Buffer in2, cl::Buffer out)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_dot;
cl::Event event = (*dot_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in1, in2, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
double gpu_sum = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_sum += tmp[i];
}
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver dot_w time: " << t_dot.stop() << " s";
OpmLog::info(oss.str());
}
return gpu_sum;
}
template <unsigned int block_size>
double openclSolverBackend<block_size>::norm_w(cl::Buffer in, cl::Buffer out)
{
const unsigned int work_group_size = 256;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_norm;
cl::Event event = (*norm_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, out, N, cl::Local(lmem_per_work_group));
queue->enqueueReadBuffer(out, CL_TRUE, 0, sizeof(double) * num_work_groups, tmp);
double gpu_norm = 0.0;
for (unsigned int i = 0; i < num_work_groups; ++i) {
gpu_norm += tmp[i];
}
gpu_norm = sqrt(gpu_norm);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver norm_w time: " << t_norm.stop() << " s";
OpmLog::info(oss.str());
}
return gpu_norm;
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::axpy_w(cl::Buffer in, const double a, cl::Buffer out)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_axpy;
cl::Event event = (*axpy_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), in, a, out, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver axpy_w time: " << t_axpy.stop() << " s";
OpmLog::info(oss.str());
}
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::custom_w(cl::Buffer p, cl::Buffer v, cl::Buffer r, const double omega, const double beta)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
Timer t_custom;
cl::Event event = (*custom_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), p, v, r, omega, beta, N);
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver custom_w time: " << t_custom.stop() << " s";
OpmLog::info(oss.str());
}
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b)
{
const unsigned int work_group_size = 32;
const unsigned int num_work_groups = ceilDivision(N, work_group_size);
const unsigned int total_work_items = num_work_groups * work_group_size;
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
Timer t_spmv;
cl::Event event = (*spmv_blocked_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), vals, cols, rows, Nb, x, b, block_size, cl::Local(lmem_per_work_group));
if (verbosity >= 4) {
event.wait();
std::ostringstream oss;
oss << std::scientific << "openclSolver spmv_blocked_w time: " << t_spmv.stop() << " s";
OpmLog::info(oss.str());
}
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
@ -320,7 +203,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
double norm, norm_0;
if(wellContribs.getNumWells() > 0){
wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get());
// wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get());
}
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
@ -348,7 +231,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
OPM_THROW(std::logic_error, "openclSolverBackend OpenCL enqueue[Fill|Copy]Buffer error");
}
norm = norm_w(d_r, d_tmp);
norm = OpenclKernels::norm(d_r, d_tmp, N);
norm_0 = norm;
if (verbosity > 1) {
@ -360,11 +243,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_rest.start();
for (it = 0.5; it < maxit; it += 0.5) {
rhop = rho;
rho = dot_w(d_rw, d_r, d_tmp);
rho = OpenclKernels::dot(d_rw, d_r, d_tmp, N);
if (it > 1) {
beta = (rho / rhop) * (alpha / omega);
custom_w(d_p, d_v, d_r, omega, beta);
OpenclKernels::custom(d_p, d_v, d_r, omega, beta, N);
}
t_rest.stop();
@ -375,7 +258,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// v = A * pw
t_spmv.start();
spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_pw, d_v);
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_pw, d_v, Nb, block_size);
t_spmv.stop();
// apply wellContributions
@ -386,11 +269,11 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_well.stop();
t_rest.start();
tmp1 = dot_w(d_rw, d_v, d_tmp);
tmp1 = OpenclKernels::dot(d_rw, d_v, d_tmp, N);
alpha = rho / tmp1;
axpy_w(d_v, -alpha, d_r); // r = r - alpha * v
axpy_w(d_pw, alpha, d_x); // x = x + alpha * pw
norm = norm_w(d_r, d_tmp);
OpenclKernels::axpy(d_v, -alpha, d_r, N); // r = r - alpha * v
OpenclKernels::axpy(d_pw, alpha, d_x, N); // x = x + alpha * pw
norm = OpenclKernels::norm(d_r, d_tmp, N);
t_rest.stop();
if (norm < tolerance * norm_0) {
@ -406,7 +289,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// t = A * s
t_spmv.start();
spmv_blocked_w(d_Avals, d_Acols, d_Arows, d_s, d_t);
OpenclKernels::spmv_blocked(d_Avals, d_Acols, d_Arows, d_s, d_t, Nb, block_size);
t_spmv.stop();
// apply wellContributions
@ -417,12 +300,12 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
t_well.stop();
t_rest.start();
tmp1 = dot_w(d_t, d_r, d_tmp);
tmp2 = dot_w(d_t, d_t, d_tmp);
tmp1 = OpenclKernels::dot(d_t, d_r, d_tmp, N);
tmp2 = OpenclKernels::dot(d_t, d_t, d_tmp, N);
omega = tmp1 / tmp2;
axpy_w(d_s, omega, d_x); // x = x + omega * s
axpy_w(d_t, -omega, d_r); // r = r - omega * t
norm = norm_w(d_r, d_tmp);
OpenclKernels::axpy(d_s, omega, d_x, N); // x = x + omega * s
OpenclKernels::axpy(d_t, -omega, d_r, N); // r = r - omega * t
norm = OpenclKernels::norm(d_r, d_tmp, N);
t_rest.stop();
if (norm < tolerance * norm_0) {
@ -479,7 +362,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
prec->setOpenCLContext(context.get());
prec->setOpenCLQueue(queue.get());
tmp = new double[N];
#if COPY_ROW_BY_ROW
vals_contiguous = new double[N];
#endif
@ -507,10 +389,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
}
get_opencl_kernels();
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), scale_k.get(), ilu_decomp_k.get());
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
@ -525,68 +403,12 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
initialized = true;
} // end initialize()
void add_kernel_string(cl::Program::Sources &sources, std::string &source) {
sources.emplace_back(source);
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::get_opencl_kernels() {
cl::Program::Sources sources;
std::string axpy_s = get_axpy_string();
add_kernel_string(sources, axpy_s);
std::string scale_s = get_scale_string();
add_kernel_string(sources, scale_s);
std::string dot_1_s = get_dot_1_string();
add_kernel_string(sources, dot_1_s);
std::string norm_s = get_norm_string();
add_kernel_string(sources, norm_s);
std::string custom_s = get_custom_string();
add_kernel_string(sources, custom_s);
std::string spmv_blocked_s = get_spmv_blocked_string();
add_kernel_string(sources, spmv_blocked_s);
#if CHOW_PATEL
bool ilu_operate_on_full_matrix = false;
#else
bool ilu_operate_on_full_matrix = true;
#endif
std::string ILU_apply1_s = get_ILU_apply1_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply1_s);
std::string ILU_apply2_s = get_ILU_apply2_string(ilu_operate_on_full_matrix);
add_kernel_string(sources, ILU_apply2_s);
std::string stdwell_apply_s = get_stdwell_apply_string(true);
add_kernel_string(sources, stdwell_apply_s);
std::string stdwell_apply_no_reorder_s = get_stdwell_apply_string(false);
add_kernel_string(sources, stdwell_apply_no_reorder_s);
std::string ilu_decomp_s = get_ilu_decomp_string();
add_kernel_string(sources, ilu_decomp_s);
cl::Program program = cl::Program(*context, sources);
program.build(devices);
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::KernelFunctor<> myKernel(); myKernel(args, arg1, arg2); is also blocking
// actually creating the kernels
dot_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::KernelFunctor<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
custom_k.reset(new cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
ILU_apply2_k.reset(new ilu_apply2_kernel_type(cl::Kernel(program, "ILU_apply2")));
stdwell_apply_k.reset(new stdwell_apply_kernel_type(cl::Kernel(program, "stdwell_apply")));
stdwell_apply_no_reorder_k.reset(new stdwell_apply_no_reorder_kernel_type(cl::Kernel(program, "stdwell_apply_no_reorder")));
ilu_decomp_k.reset(new ilu_decomp_kernel_type(cl::Kernel(program, "ilu_decomp")));
} // end get_opencl_kernels()
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {
if (opencl_ilu_reorder != ILUReorder::NONE) {
delete[] rb;
}
delete[] tmp;
#if COPY_ROW_BY_ROW
delete[] vals_contiguous;
#endif
@ -672,11 +494,6 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
Timer t;
bool success = prec->init(mat.get());
int work_group_size = 32;
int num_work_groups = ceilDivision(N, work_group_size);
int total_work_items = num_work_groups * work_group_size;
int lmem_per_work_group = work_group_size * sizeof(double);
prec->setKernelParameters(work_group_size, total_work_items, lmem_per_work_group);
if (opencl_ilu_reorder == ILUReorder::NONE) {
rmat = mat.get();

View File

@ -61,21 +61,8 @@ private:
cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve
cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm()
cl::Buffer d_toOrder; // only used when reordering is used
double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm()
// shared pointers are also passed to other objects
std::vector<cl::Device> devices;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, const double, const unsigned int> > scale_k;
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
std::shared_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
std::shared_ptr<ilu_apply2_kernel_type> ILU_apply2_k;
std::shared_ptr<stdwell_apply_kernel_type> stdwell_apply_k;
std::shared_ptr<stdwell_apply_no_reorder_kernel_type> stdwell_apply_no_reorder_k;
std::shared_ptr<ilu_decomp_kernel_type> ilu_decomp_k;
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
@ -150,9 +137,6 @@ private:
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
/// Generate and compile opencl kernels
void get_opencl_kernels();
/// Clean memory
void finalize();