mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Merge pull request #3575 from Tongdongq/fix-ilu-decomp-gpu
Create and use scale kernel for relaxation of ILU
This commit is contained in:
@@ -584,7 +584,7 @@ BILU0<block_size>::~BILU0()
|
||||
if (verbosity >= 4) {
|
||||
out << "color " << color << ": " << firstRow << " - " << lastRow << " = " << lastRow - firstRow << "\n";
|
||||
}
|
||||
event = (*ilu_decomp_k)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2));
|
||||
event = (*ilu_decomp)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items2), cl::NDRange(work_group_size2)), firstRow, lastRow, s.LUvals, s.LUcols, s.LUrows, s.invDiagVals, s.diagIndex, LUmat->Nb, cl::Local(lmem_per_work_group2));
|
||||
event.wait();
|
||||
}
|
||||
|
||||
@@ -601,29 +601,34 @@ BILU0<block_size>::~BILU0()
|
||||
// however, if individual kernel calls are timed, waiting for events is needed
|
||||
// behavior on other GPUs is untested
|
||||
template <unsigned int block_size>
|
||||
void BILU0<block_size>::apply(cl::Buffer& x, cl::Buffer& y)
|
||||
void BILU0<block_size>::apply(cl::Buffer& y, cl::Buffer& x)
|
||||
{
|
||||
const double relaxation = 0.9;
|
||||
cl::Event event;
|
||||
Timer t_apply;
|
||||
|
||||
for(int color = 0; color < numColors; ++color){
|
||||
#if CHOW_PATEL
|
||||
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Lvals, s.Lcols, s.Lrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
#else
|
||||
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, x, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
event = (*ILU_apply1)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, y, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
#endif
|
||||
// event.wait();
|
||||
}
|
||||
|
||||
for(int color = numColors-1; color >= 0; --color){
|
||||
#if CHOW_PATEL
|
||||
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.Uvals, s.Ucols, s.Urows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
#else
|
||||
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, y, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
event = (*ILU_apply2)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), s.LUvals, s.LUcols, s.LUrows, s.diagIndex, s.invDiagVals, x, s.rowsPerColor, color, block_size, cl::Local(lmem_per_work_group));
|
||||
#endif
|
||||
// event.wait();
|
||||
}
|
||||
|
||||
// apply relaxation
|
||||
event = (*scale)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)), x, relaxation, N);
|
||||
event.wait();
|
||||
|
||||
if (verbosity >= 4) {
|
||||
event.wait();
|
||||
std::ostringstream out;
|
||||
@@ -651,11 +656,13 @@ template <unsigned int block_size>
|
||||
void BILU0<block_size>::setKernels(
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1_,
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2_,
|
||||
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_k_
|
||||
cl::make_kernel<cl::Buffer&, const double, const unsigned int> *scale_,
|
||||
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_
|
||||
){
|
||||
this->ILU_apply1 = ILU_apply1_;
|
||||
this->ILU_apply2 = ILU_apply2_;
|
||||
this->ilu_decomp_k = ilu_decomp_k_;
|
||||
this->scale = scale_;
|
||||
this->ilu_decomp = ilu_decomp_;
|
||||
}
|
||||
|
||||
|
||||
@@ -672,7 +679,8 @@ template void BILU0<n>::setKernelParameters(unsigned int, unsigned int, unsigned
|
||||
template void BILU0<n>::setKernels( \
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *, \
|
||||
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> * \
|
||||
cl::make_kernel<cl::Buffer&, const double, const unsigned int> *, \
|
||||
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> * \
|
||||
);
|
||||
|
||||
INSTANTIATE_BDA_FUNCTIONS(1);
|
||||
|
||||
@@ -85,9 +85,10 @@ namespace bda
|
||||
|
||||
ilu_apply1_kernel_type *ILU_apply1;
|
||||
ilu_apply2_kernel_type *ILU_apply2;
|
||||
cl::make_kernel<cl::Buffer&, const double, const unsigned int> *scale;
|
||||
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&,
|
||||
const int, cl::LocalSpaceArg> *ilu_decomp_k;
|
||||
const int, cl::LocalSpaceArg> *ilu_decomp;
|
||||
|
||||
GPU_storage s;
|
||||
cl::Context *context;
|
||||
@@ -114,8 +115,8 @@ namespace bda
|
||||
// ilu_decomposition
|
||||
bool create_preconditioner(BlockedMatrix<block_size> *mat);
|
||||
|
||||
// apply preconditioner, y = prec(x)
|
||||
void apply(cl::Buffer& x, cl::Buffer& y);
|
||||
// apply preconditioner, x = prec(y)
|
||||
void apply(cl::Buffer& y, cl::Buffer& x);
|
||||
|
||||
void setOpenCLContext(cl::Context *context);
|
||||
void setOpenCLQueue(cl::CommandQueue *queue);
|
||||
@@ -123,7 +124,8 @@ namespace bda
|
||||
void setKernels(
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply1,
|
||||
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> *ILU_apply2,
|
||||
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp_k
|
||||
cl::make_kernel<cl::Buffer&, const double, const unsigned int> *scale,
|
||||
cl::make_kernel<const unsigned int, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const int, cl::LocalSpaceArg> *ilu_decomp
|
||||
);
|
||||
|
||||
int* getToOrder()
|
||||
|
||||
@@ -42,6 +42,26 @@ namespace bda
|
||||
}
|
||||
|
||||
|
||||
// scale vector with scalar
|
||||
std::string get_scale_string() {
|
||||
return R"(
|
||||
__kernel void scale(
|
||||
__global double *vec,
|
||||
const double a,
|
||||
const int N)
|
||||
{
|
||||
unsigned int NUM_THREADS = get_global_size(0);
|
||||
int idx = get_global_id(0);
|
||||
|
||||
while(idx < N){
|
||||
vec[idx] *= a;
|
||||
idx += NUM_THREADS;
|
||||
}
|
||||
}
|
||||
)";
|
||||
}
|
||||
|
||||
|
||||
// returns partial sums, instead of the final dot product
|
||||
std::string get_dot_1_string() {
|
||||
return R"(
|
||||
@@ -319,13 +339,12 @@ namespace bda
|
||||
const unsigned int num_blocks_per_warp = warpsize/bs/bs;
|
||||
const unsigned int NUM_THREADS = get_global_size(0);
|
||||
const unsigned int num_warps_in_grid = NUM_THREADS / warpsize;
|
||||
unsigned int idx = get_global_id(0);
|
||||
unsigned int target_block_row = idx / warpsize;
|
||||
unsigned int idx_g = get_global_id(0);
|
||||
unsigned int target_block_row = idx_g / warpsize;
|
||||
target_block_row += nodesPerColorPrefix[color];
|
||||
const unsigned int lane = idx_t % warpsize;
|
||||
const unsigned int c = (lane / bs) % bs;
|
||||
const unsigned int r = lane % bs;
|
||||
const double relaxation = 0.9;
|
||||
|
||||
while(target_block_row < nodesPerColorPrefix[color+1]){
|
||||
)";
|
||||
@@ -372,7 +391,7 @@ namespace bda
|
||||
}
|
||||
|
||||
const unsigned int row = target_block_row*bs + lane;
|
||||
x[row] = relaxation * sum;
|
||||
x[row] = sum;
|
||||
}
|
||||
|
||||
target_block_row += num_warps_in_grid;
|
||||
|
||||
@@ -48,6 +48,10 @@ using ilu_decomp_kernel_type = cl::make_kernel<const unsigned int, const unsigne
|
||||
/// a = a + alpha * b
|
||||
std::string get_axpy_string();
|
||||
|
||||
/// Generate string with scale kernel
|
||||
/// a = a * alpha
|
||||
std::string get_scale_string();
|
||||
|
||||
/// returns partial sums, instead of the final dot product
|
||||
/// partial sums are added on CPU
|
||||
std::string get_dot_1_string();
|
||||
|
||||
@@ -509,7 +509,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
||||
|
||||
get_opencl_kernels();
|
||||
|
||||
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), ilu_decomp_k.get());
|
||||
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get(), scale_k.get(), ilu_decomp_k.get());
|
||||
|
||||
} catch (const cl::Error& error) {
|
||||
std::ostringstream oss;
|
||||
@@ -535,6 +535,8 @@ void openclSolverBackend<block_size>::get_opencl_kernels() {
|
||||
cl::Program::Sources sources;
|
||||
std::string axpy_s = get_axpy_string();
|
||||
add_kernel_string(sources, axpy_s);
|
||||
std::string scale_s = get_scale_string();
|
||||
add_kernel_string(sources, scale_s);
|
||||
std::string dot_1_s = get_dot_1_string();
|
||||
add_kernel_string(sources, dot_1_s);
|
||||
std::string norm_s = get_norm_string();
|
||||
@@ -569,6 +571,7 @@ void openclSolverBackend<block_size>::get_opencl_kernels() {
|
||||
dot_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "dot_1")));
|
||||
norm_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "norm")));
|
||||
axpy_k.reset(new cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int>(cl::Kernel(program, "axpy")));
|
||||
scale_k.reset(new cl::make_kernel<cl::Buffer&, const double, const unsigned int>(cl::Kernel(program, "scale")));
|
||||
custom_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int>(cl::Kernel(program, "custom")));
|
||||
spmv_blocked_k.reset(new spmv_kernel_type(cl::Kernel(program, "spmv_blocked")));
|
||||
ILU_apply1_k.reset(new ilu_apply1_kernel_type(cl::Kernel(program, "ILU_apply1")));
|
||||
|
||||
@@ -68,6 +68,7 @@ private:
|
||||
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
|
||||
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
|
||||
std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
|
||||
std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, const unsigned int> > scale_k;
|
||||
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const double, const double, const unsigned int> > custom_k;
|
||||
std::unique_ptr<spmv_kernel_type> spmv_blocked_k;
|
||||
std::shared_ptr<ilu_apply1_kernel_type> ILU_apply1_k;
|
||||
@@ -111,6 +112,11 @@ private:
|
||||
/// \param[inout] out output vector
|
||||
void axpy_w(cl::Buffer in, const double a, cl::Buffer out);
|
||||
|
||||
/// Perform scale: vec *= a
|
||||
/// \param[inout] vec vector to scale
|
||||
/// \param[in] a scalar value to multiply vector
|
||||
void scale_w(cl::Buffer vec, const double a);
|
||||
|
||||
/// Custom function that combines scale, axpy and add functions in bicgstab
|
||||
/// p = (p - omega * v) * beta + r
|
||||
/// \param[inout] p output vector
|
||||
|
||||
Reference in New Issue
Block a user