Made reordering optional for WellContributions

Removed unnecessary alloc
This commit is contained in:
tqiu
2021-01-13 11:54:40 +01:00
parent f26e58c6ca
commit 0f9ae3695d
6 changed files with 124 additions and 37 deletions

View File

@@ -57,7 +57,6 @@ namespace bda
delete[] toOrder;
delete[] fromOrder;
}
delete[] LUmat->nnzValues;
}
template <unsigned int block_size>
@@ -130,8 +129,6 @@ namespace bda
Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
LUmat->nnzValues = new double[mat->nnzbs * bs * bs];
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs);
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);

View File

@@ -56,11 +56,6 @@ WellContributions::~WellContributions()
if(num_std_wells > 0){
delete[] val_pointers;
#if HAVE_OPENCL
if(opencl_gpu){
delete[] h_toOrder;
}
#endif
}
#if HAVE_OPENCL
@@ -79,8 +74,15 @@ void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *qu
this->queue = queue_;
}
void WellContributions::setKernel(kernel_type *kernel_){
void WellContributions::setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_){
this->kernel = kernel_;
this->kernel_no_reorder = kernel_no_reorder_;
}
void WellContributions::setReordering(int *h_toOrder_, bool reorder_)
{
this->h_toOrder = h_toOrder_;
this->reorder = reorder_;
}
void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
@@ -90,29 +92,24 @@ void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffe
const unsigned int lmem2 = sizeof(double) * dim_wells;
cl::Event event;
event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
if (reorder) {
event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
} else {
event = (*kernel_no_reorder)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, dim, dim_wells, *d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
}
event.wait();
}
void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
if(h_x == nullptr){
h_x = new double[N];
h_y = new double[N];
}
if(h_toOrder == nullptr){
h_toOrder = new int[Nb];
}
if(!read_toOrder){
events.resize(1);
queue->enqueueReadBuffer(d_toOrder, CL_FALSE, 0, sizeof(int) * Nb, h_toOrder, nullptr, &events[0]);
events[0].wait();
events.clear();
read_toOrder = true;
}
events.resize(2);
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]);
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]);
@@ -121,7 +118,7 @@ void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer
// actually apply MultisegmentWells
for(Opm::MultisegmentWellContribution *well: multisegments){
well->setReordering(h_toOrder, true);
well->setReordering(h_toOrder, reorder);
well->apply(h_x, h_y);
}
@@ -138,7 +135,7 @@ void WellContributions::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrd
}
if(num_ms_wells > 0){
apply_mswells(d_x, d_y, d_toOrder);
apply_mswells(d_x, d_y);
}
}
#endif

View File

@@ -95,17 +95,21 @@ private:
typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type_no_reorder;
cl::Context *context;
cl::CommandQueue *queue;
kernel_type *kernel;
kernel_type_no_reorder *kernel_no_reorder;
std::vector<cl::Event> events;
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
bool read_toOrder = false;
bool reorder = false;
int *h_toOrder = nullptr;
#endif
@@ -150,10 +154,16 @@ public:
#endif
#if HAVE_OPENCL
void setKernel(kernel_type *kernel_);
void setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_);
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
/// Those indices need to be mapped via toOrder
/// \param[in] toOrder array with mappings
/// \param[in] reorder whether reordering is actually used or not
void setReordering(int *toOrder, bool reorder);
void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
void apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
#endif

View File

@@ -440,6 +440,79 @@ namespace bda
}
)";
inline const char* stdwell_apply_no_reorder_s = R"(
__kernel void stdwell_apply_no_reorder(__global const double *Cnnzs,
__global const double *Dnnzs,
__global const double *Bnnzs,
__global const int *Ccols,
__global const int *Bcols,
__global const double *x,
__global double *y,
const unsigned int dim,
const unsigned int dim_wells,
__global const unsigned int *val_pointers,
__local double *localSum,
__local double *z1,
__local double *z2){
int wgId = get_group_id(0);
int wiId = get_local_id(0);
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
int valsPerBlock = dim*dim_wells;
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
int c = wiId % dim;
int r = (wiId/dim) % dim_wells;
double temp;
barrier(CLK_LOCAL_MEM_FENCE);
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
int b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
int colIdx = Bcols[b];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
}
if(wiId < valsPerBlock){
localSum[wiId] += localSum[wiId + valsPerBlock];
}
b = wiId/valsPerBlock + val_pointers[wgId];
if(c == 0 && wiId < valsPerBlock){
for(unsigned int stride = 2; stride > 0; stride >>= 1){
localSum[wiId] += localSum[wiId + stride];
}
z1[r] = localSum[wiId];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim_wells){
temp = 0.0;
for(unsigned int i = 0; i < dim_wells; ++i){
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < dim*valSize){
temp = 0.0;
int bb = wiId/dim + val_pointers[wgId];
for (unsigned int j = 0; j < dim_wells; ++j){
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
int colIdx = Ccols[bb];
y[colIdx*dim + c] -= temp;
}
}
)";
} // end namespace bda
#endif

View File

@@ -20,7 +20,6 @@
#include <config.h>
#include <cmath>
#include <sstream>
#include <iostream>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
@@ -318,7 +317,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
double norm, norm_0;
if(wellContribs.getNumWells() > 0){
wellContribs.setKernel(stdwell_apply_k.get());
wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get());
}
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
@@ -479,6 +478,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s)));
program = cl::Program(*context, source);
program.build(devices);
@@ -512,7 +512,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
rb = new double[N];
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
}
wcontainer->init(wellContribs, N, Nb, reorder);
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@@ -529,6 +528,10 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply")));
stdwell_apply_no_reorder_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder")));
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
@@ -653,14 +656,16 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
void openclSolverBackend<block_size>::update_system(double *vals, double *b, WellContributions &wellContribs) {
Timer t;
mat->nnzValues = vals;
if (opencl_ilu_reorder != ILUReorder::NONE) {
reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
wellContribs.setReordering(toOrder, true);
} else {
rb = b;
wellContribs.setReordering(nullptr, false);
}
if (verbosity > 2) {
@@ -743,13 +748,13 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
}
}
update_system(vals, b);
update_system(vals, b, wellContribs);
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
copy_system_to_gpu();
} else {
update_system(vals, b);
update_system(vals, b, wellContribs);
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}

View File

@@ -76,6 +76,10 @@ private:
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k;
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_no_reorder_k;
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
@@ -152,7 +156,8 @@ private:
/// Reorder the linear system so it corresponds with the coloring
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] b input vectors, contains N values
void update_system(double *vals, double *b);
/// \param[out] wellContribs WellContributions, to set reordering
void update_system(double *vals, double *b, WellContributions &wellContribs);
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
void update_system_on_gpu();