mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Tweak ILU opencl kernel launch params
This commit is contained in:
parent
d0aa42e9e3
commit
58b20fe2fb
@ -275,7 +275,7 @@ bool BILU0<block_size>::create_preconditioner(BlockedMatrix *mat, BlockedMatrix
|
|||||||
}
|
}
|
||||||
OpenclKernels::ILU_decomp(firstRow, lastRow,
|
OpenclKernels::ILU_decomp(firstRow, lastRow,
|
||||||
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
|
s.LUvals, s.LUcols, s.LUrows, s.diagIndex,
|
||||||
s.invDiagVals, Nb, block_size);
|
s.invDiagVals, rowsPerColor[color], block_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (verbosity >= 3) {
|
if (verbosity >= 3) {
|
||||||
@ -303,11 +303,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
|
|||||||
#if CHOW_PATEL
|
#if CHOW_PATEL
|
||||||
OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows,
|
OpenclKernels::ILU_apply1(s.Lvals, s.Lcols, s.Lrows,
|
||||||
s.diagIndex, y, x, s.rowsPerColor,
|
s.diagIndex, y, x, s.rowsPerColor,
|
||||||
color, Nb, block_size);
|
color, rowsPerColor[color], block_size);
|
||||||
#else
|
#else
|
||||||
OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows,
|
OpenclKernels::ILU_apply1(s.LUvals, s.LUcols, s.LUrows,
|
||||||
s.diagIndex, y, x, s.rowsPerColor,
|
s.diagIndex, y, x, s.rowsPerColor,
|
||||||
color, Nb, block_size);
|
color, rowsPerColor[color], block_size);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -315,11 +315,11 @@ void BILU0<block_size>::apply(const cl::Buffer& y, cl::Buffer& x)
|
|||||||
#if CHOW_PATEL
|
#if CHOW_PATEL
|
||||||
OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows,
|
OpenclKernels::ILU_apply2(s.Uvals, s.Ucols, s.Urows,
|
||||||
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
|
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
|
||||||
color, Nb, block_size);
|
color, rowsPerColor[color], block_size);
|
||||||
#else
|
#else
|
||||||
OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows,
|
OpenclKernels::ILU_apply2(s.LUvals, s.LUcols, s.LUrows,
|
||||||
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
|
s.diagIndex, s.invDiagVals, x, s.rowsPerColor,
|
||||||
color, Nb, block_size);
|
color, rowsPerColor[color], block_size);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -14,7 +14,7 @@ __kernel void ILU_apply1(
|
|||||||
const unsigned int block_size,
|
const unsigned int block_size,
|
||||||
__local double *tmp)
|
__local double *tmp)
|
||||||
{
|
{
|
||||||
const unsigned int warpsize = 32;
|
const unsigned int warpsize = get_local_size(0);
|
||||||
const unsigned int bs = block_size;
|
const unsigned int bs = block_size;
|
||||||
const unsigned int idx_t = get_local_id(0);
|
const unsigned int idx_t = get_local_id(0);
|
||||||
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||||
|
@ -14,7 +14,7 @@ __kernel void ILU_apply2(
|
|||||||
const unsigned int block_size,
|
const unsigned int block_size,
|
||||||
__local double *tmp)
|
__local double *tmp)
|
||||||
{
|
{
|
||||||
const unsigned int warpsize = 32;
|
const unsigned int warpsize = get_local_size(0);
|
||||||
const unsigned int bs = block_size;
|
const unsigned int bs = block_size;
|
||||||
const unsigned int idx_t = get_local_id(0);
|
const unsigned int idx_t = get_local_id(0);
|
||||||
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
const unsigned int num_active_threads = (warpsize/bs/bs)*bs*bs;
|
||||||
|
@ -41,6 +41,7 @@ int OpenclKernels::verbosity;
|
|||||||
cl::CommandQueue *OpenclKernels::queue;
|
cl::CommandQueue *OpenclKernels::queue;
|
||||||
std::vector<double> OpenclKernels::tmp;
|
std::vector<double> OpenclKernels::tmp;
|
||||||
bool OpenclKernels::initialized = false;
|
bool OpenclKernels::initialized = false;
|
||||||
|
size_t OpenclKernels::preferred_workgroup_size_multiple = 0;
|
||||||
|
|
||||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k;
|
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::dot_k;
|
||||||
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
|
std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > OpenclKernels::norm_k;
|
||||||
@ -140,6 +141,11 @@ void OpenclKernels::init(cl::Context *context, cl::CommandQueue *queue_, std::ve
|
|||||||
isaiL_k.reset(new isaiL_kernel_type(cl::Kernel(program, "isaiL")));
|
isaiL_k.reset(new isaiL_kernel_type(cl::Kernel(program, "isaiL")));
|
||||||
isaiU_k.reset(new isaiU_kernel_type(cl::Kernel(program, "isaiU")));
|
isaiU_k.reset(new isaiU_kernel_type(cl::Kernel(program, "isaiU")));
|
||||||
|
|
||||||
|
// testing shows all kernels have the same preferred_workgroup_size_multiple
|
||||||
|
// 32 for NVIDIA
|
||||||
|
// 64 for AMD
|
||||||
|
cl::Kernel(program, "ILU_apply1").getWorkGroupInfo(devices[0], CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, &preferred_workgroup_size_multiple);
|
||||||
|
|
||||||
initialized = true;
|
initialized = true;
|
||||||
} // end get_opencl_kernels()
|
} // end get_opencl_kernels()
|
||||||
|
|
||||||
@ -388,10 +394,10 @@ void OpenclKernels::ILU_apply1(cl::Buffer& vals, cl::Buffer& cols,
|
|||||||
cl::Buffer& rows, cl::Buffer& diagIndex,
|
cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||||
const cl::Buffer& y, cl::Buffer& x,
|
const cl::Buffer& y, cl::Buffer& x,
|
||||||
cl::Buffer& rowsPerColor, int color,
|
cl::Buffer& rowsPerColor, int color,
|
||||||
int Nb, unsigned int block_size)
|
int rowsThisColor, unsigned int block_size)
|
||||||
{
|
{
|
||||||
const unsigned int work_group_size = 32;
|
const unsigned int work_group_size = preferred_workgroup_size_multiple;
|
||||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
const unsigned int num_work_groups = rowsThisColor;
|
||||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||||
Timer t_ilu_apply1;
|
Timer t_ilu_apply1;
|
||||||
@ -413,10 +419,10 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
|
|||||||
cl::Buffer& rows, cl::Buffer& diagIndex,
|
cl::Buffer& rows, cl::Buffer& diagIndex,
|
||||||
cl::Buffer& invDiagVals, cl::Buffer& x,
|
cl::Buffer& invDiagVals, cl::Buffer& x,
|
||||||
cl::Buffer& rowsPerColor, int color,
|
cl::Buffer& rowsPerColor, int color,
|
||||||
int Nb, unsigned int block_size)
|
int rowsThisColor, unsigned int block_size)
|
||||||
{
|
{
|
||||||
const unsigned int work_group_size = 32;
|
const unsigned int work_group_size = preferred_workgroup_size_multiple;
|
||||||
const unsigned int num_work_groups = ceilDivision(Nb, work_group_size);
|
const unsigned int num_work_groups = rowsThisColor;
|
||||||
const unsigned int total_work_items = num_work_groups * work_group_size;
|
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||||
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
const unsigned int lmem_per_work_group = sizeof(double) * work_group_size;
|
||||||
Timer t_ilu_apply2;
|
Timer t_ilu_apply2;
|
||||||
@ -437,19 +443,19 @@ void OpenclKernels::ILU_apply2(cl::Buffer& vals, cl::Buffer& cols,
|
|||||||
void OpenclKernels::ILU_decomp(int firstRow, int lastRow,
|
void OpenclKernels::ILU_decomp(int firstRow, int lastRow,
|
||||||
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
cl::Buffer& vals, cl::Buffer& cols, cl::Buffer& rows,
|
||||||
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
|
cl::Buffer& diagIndex, cl::Buffer& invDiagVals,
|
||||||
int Nb, unsigned int block_size)
|
int rowsThisColor, unsigned int block_size)
|
||||||
{
|
{
|
||||||
const unsigned int work_group_size2 = 128;
|
const unsigned int work_group_size = 128;
|
||||||
const unsigned int num_work_groups2 = 1024;
|
const unsigned int num_work_groups = rowsThisColor;
|
||||||
const unsigned int total_work_items2 = num_work_groups2 * work_group_size2;
|
const unsigned int total_work_items = num_work_groups * work_group_size;
|
||||||
const unsigned int num_hwarps_per_group = work_group_size2 / 16;
|
const unsigned int num_hwarps_per_group = work_group_size / 16;
|
||||||
const unsigned int lmem_per_work_group2 = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot
|
const unsigned int lmem_per_work_group = num_hwarps_per_group * block_size * block_size * sizeof(double); // each block needs a pivot
|
||||||
Timer t_ilu_decomp;
|
Timer t_ilu_decomp;
|
||||||
|
|
||||||
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)),
|
cl::Event event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||||
firstRow, lastRow, vals, cols, rows,
|
firstRow, lastRow, vals, cols, rows,
|
||||||
invDiagVals, diagIndex, Nb,
|
invDiagVals, diagIndex, rowsThisColor,
|
||||||
cl::Local(lmem_per_work_group2));
|
cl::Local(lmem_per_work_group));
|
||||||
|
|
||||||
if (verbosity >= 4) {
|
if (verbosity >= 4) {
|
||||||
event.wait();
|
event.wait();
|
||||||
|
@ -64,6 +64,7 @@ private:
|
|||||||
static cl::CommandQueue *queue;
|
static cl::CommandQueue *queue;
|
||||||
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
|
static std::vector<double> tmp; // used as tmp CPU buffer for dot() and norm()
|
||||||
static bool initialized;
|
static bool initialized;
|
||||||
|
static size_t preferred_workgroup_size_multiple; // stores CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
|
||||||
|
|
||||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
|
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
|
||||||
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
|
static std::unique_ptr<cl::KernelFunctor<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
|
||||||
|
Loading…
Reference in New Issue
Block a user