Merge pull request #3575 from Tongdongq/fix-ilu-decomp-gpu

Create and use scale kernel for relaxation of ILU
This commit is contained in:
Markus Blatt
2021-10-07 15:55:17 +02:00
committed by GitHub
6 changed files with 60 additions and 18 deletions

View File

@@ -584,7 +584,7 @@ BILU0<block_size>::~BILU0()
if (verbosity >= 4) {
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
}
event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2));
event = (*ilu_decomp)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2));
event.wait();
}
@@ -601,29 +601,34 @@ BILU0<block_size>::~BILU0()
// however, if individual kernel calls are timed, waiting for events is needed
// behavior on other GPUs is untested
template <unsigned int block_size>
void BILU0<block_size>::apply(cl::Buffer& x, cl::Buffer& y)
void BILU0<block_size>::apply(cl::Buffer& y, cl::Buffer& x)
{
const double relaxation = 0.9;
cl::Event event;
Timer t_apply;
for(int color = 0; color < numColors; ++color){
#if CHOW_PATEL
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
#else
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
#endif
// event.wait();
}
for(int color = numColors-1; color >= 0; --color){
#if CHOW_PATEL
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
#else
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
#endif
// event.wait();
}
// apply relaxation
event = (*scale)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), x, relaxation, N);
event.wait();
if (verbosity >= 4) {
event.wait();
std::ostringstream out;
@@ -651,11 +656,13 @@ template <unsigned int block_size>
void BILU0<block_size>::setKernels(
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_,
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_k_
cl::make_kernel<cl::Buffer&, const double, const unsigned int> *scale_,
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_
){
this->ILU_apply1 = ILU_apply1_;
this->ILU_apply2 = ILU_apply2_;
this->ilu_decomp_k = ilu_decomp_k_;
this->scale = scale_;
this->ilu_decomp = ilu_decomp_;
}
@@ -672,7 +679,8 @@ template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned
template void BILU0<n>::setKernels( \
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> * \
cl::make_kernel<cl::Buffer&, const double, const unsigned int> *, \
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> * \
);
INSTANTIATE_BDA_FUNCTIONS(1);

View File

@@ -85,9 +85,10 @@ namespace bda
ilu_apply1_kernel_type *ILU_apply1;
ilu_apply2_kernel_type *ILU_apply2;
cl::make_kernel<cl::Buffer&, const double, const unsigned int> *scale;
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&,
const int, cl::LocalSpaceArg> *ilu_decomp_k;
const int, cl::LocalSpaceArg> *ilu_decomp;
GPU_storage s;
cl::Context *context;
@@ -114,8 +115,8 @@ namespace bda
// ilu_decomposition
bool create_preconditioner(BlockedMatrix<block_size> *mat);
// apply preconditioner, y = prec(x)
void apply(cl::Buffer& x, cl::Buffer& y);
// apply preconditioner, x = prec(y)
void apply(cl::Buffer& y, cl::Buffer& x);
void setOpenCLContext(cl::Context *context);
void setOpenCLQueue(cl::CommandQueue *queue);
@@ -123,7 +124,8 @@ namespace bda
void setKernels(
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2,
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_k
cl::make_kernel<cl::Buffer&, const double, const unsigned int> *scale,
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp
);
int* getToOrder()

View File

@@ -42,6 +42,26 @@ namespace bda
}
// scale vector with scalar
std::string get_scale_string() {
return R"(
__kernel void scale(
__global double *vec,
const double a,
const int N)
{
unsigned int NUM_THREADS = get_global_size(0);
int idx = get_global_id(0);
while(idx < N){
vec[idx] *= a;
idx += NUM_THREADS;
}
}
)";
}
// returns partial sums, instead of the final dot product
std::string get_dot_1_string() {
return R"(
@@ -319,13 +339,12 @@ namespace bda
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
const unsigned int NUM_THREADS = get_global_size(0);
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
unsigned int idx = get_global_id(0);
unsigned int target_block_row = idx / warpsize;
unsigned int idx_g = get_global_id(0);
unsigned int target_block_row = idx_g / warpsize;
target_block_row += nodesPerColorPrefix[color];
const unsigned int lane = idx_t % warpsize;
const unsigned int c = (lane / bs) % bs;
const unsigned int r = lane % bs;
const double relaxation = 0.9;
while(target_block_row < nodesPerColorPrefix[color+1]){
)";
@@ -372,7 +391,7 @@ namespace bda
}
const unsigned int row = target_block_row*bs + lane;
x[row] = relaxation * sum;
x[row] = sum;
}
target_block_row += num_warps_in_grid;

View File

@@ -48,6 +48,10 @@ using ilu_decomp_kernel_type = cl::make_kernel<const unsigned int, const unsigne
/// a = a + alpha * b
std::string get_axpy_string();
/// Generate string with scale kernel
/// a = a * alpha
std::string get_scale_string();
/// returns partial sums, instead of the final dot product
/// partial sums are added on CPU
std::string get_dot_1_string();

View File

@@ -509,7 +509,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
get_opencl_kernels();
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get());
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), scale_k.get(), ilu_decomp_k.get());
} catch (const cl::Error& error) {
std::ostringstream oss;
@@ -535,6 +535,8 @@ void openclSolverBackend<block_size>::get_opencl_kernels() {
cl::Program::Sources sources;
std::string axpy_s = get_axpy_string();
add_kernel_string(sources, axpy_s);
std::string scale_s = get_scale_string();
add_kernel_string(sources, scale_s);
std::string dot_1_s = get_dot_1_string();
add_kernel_string(sources, dot_1_s);
std::string norm_s = get_norm_string();
@@ -569,6 +571,7 @@ void openclSolverBackend<block_size>::get_opencl_kernels() {
dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
scale_k.reset(new cl::make_kernel<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));

View File

@@ -68,6 +68,7 @@ private:
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, const unsigned int> > scale_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
std::shared_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
@@ -111,6 +112,11 @@ private:
/// \param[inout] out output vector
void axpy_w(cl::Buffer in, const double a, cl::Buffer out);
/// Perform scale: vec *= a
/// \param[inout] vec vector to scale
/// \param[in] a scalar value to multiply vector
void scale_w(cl::Buffer vec, const double a);
/// Custom function that combines scale, axpy and add functions in bicgstab
/// p = (p - omega * v) * beta + r
/// \param[inout] p output vector