mirror of
https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Made reordering optional for WellContributions
Removed unnecessary alloc
This commit is contained in:
@@ -57,7 +57,6 @@ namespace bda
|
||||
delete[] toOrder;
|
||||
delete[] fromOrder;
|
||||
}
|
||||
delete[] LUmat->nnzValues;
|
||||
}
|
||||
|
||||
template <unsigned int block_size>
|
||||
@@ -130,8 +129,6 @@ namespace bda
|
||||
Lmat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
Umat = std::make_unique<BlockedMatrix<block_size> >(mat->Nb, (mat->nnzbs - mat->Nb) / 2);
|
||||
|
||||
LUmat->nnzValues = new double[mat->nnzbs * bs * bs];
|
||||
|
||||
s.Lvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Lmat->nnzbs);
|
||||
s.Uvals = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * bs * bs * Umat->nnzbs);
|
||||
s.Lcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Lmat->nnzbs);
|
||||
|
||||
@@ -56,11 +56,6 @@ WellContributions::~WellContributions()
|
||||
|
||||
if(num_std_wells > 0){
|
||||
delete[] val_pointers;
|
||||
#if HAVE_OPENCL
|
||||
if(opencl_gpu){
|
||||
delete[] h_toOrder;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if HAVE_OPENCL
|
||||
@@ -79,8 +74,15 @@ void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *qu
|
||||
this->queue = queue_;
|
||||
}
|
||||
|
||||
void WellContributions::setKernel(kernel_type *kernel_){
|
||||
void WellContributions::setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_){
|
||||
this->kernel = kernel_;
|
||||
this->kernel_no_reorder = kernel_no_reorder_;
|
||||
}
|
||||
|
||||
void WellContributions::setReordering(int *h_toOrder_, bool reorder_)
|
||||
{
|
||||
this->h_toOrder = h_toOrder_;
|
||||
this->reorder = reorder_;
|
||||
}
|
||||
|
||||
void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
|
||||
@@ -90,29 +92,24 @@ void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffe
|
||||
const unsigned int lmem2 = sizeof(double) * dim_wells;
|
||||
|
||||
cl::Event event;
|
||||
if (reorder) {
|
||||
event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl,
|
||||
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
|
||||
} else {
|
||||
event = (*kernel_no_reorder)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
|
||||
*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, dim, dim_wells, *d_val_pointers_ocl,
|
||||
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
|
||||
}
|
||||
event.wait();
|
||||
}
|
||||
|
||||
void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
|
||||
void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
|
||||
if(h_x == nullptr){
|
||||
h_x = new double[N];
|
||||
h_y = new double[N];
|
||||
}
|
||||
|
||||
if(h_toOrder == nullptr){
|
||||
h_toOrder = new int[Nb];
|
||||
}
|
||||
|
||||
if(!read_toOrder){
|
||||
events.resize(1);
|
||||
queue->enqueueReadBuffer(d_toOrder, CL_FALSE, 0, sizeof(int) * Nb, h_toOrder, nullptr, &events[0]);
|
||||
events[0].wait();
|
||||
events.clear();
|
||||
read_toOrder = true;
|
||||
}
|
||||
|
||||
events.resize(2);
|
||||
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]);
|
||||
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]);
|
||||
@@ -121,7 +118,7 @@ void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer
|
||||
|
||||
// actually apply MultisegmentWells
|
||||
for(Opm::MultisegmentWellContribution *well: multisegments){
|
||||
well->setReordering(h_toOrder, true);
|
||||
well->setReordering(h_toOrder, reorder);
|
||||
well->apply(h_x, h_y);
|
||||
}
|
||||
|
||||
@@ -138,7 +135,7 @@ void WellContributions::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrd
|
||||
}
|
||||
|
||||
if(num_ms_wells > 0){
|
||||
apply_mswells(d_x, d_y, d_toOrder);
|
||||
apply_mswells(d_x, d_y);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -95,17 +95,21 @@ private:
|
||||
typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&,
|
||||
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
|
||||
typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
const unsigned int, const unsigned int, cl::Buffer&,
|
||||
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type_no_reorder;
|
||||
|
||||
cl::Context *context;
|
||||
cl::CommandQueue *queue;
|
||||
kernel_type *kernel;
|
||||
kernel_type_no_reorder *kernel_no_reorder;
|
||||
std::vector<cl::Event> events;
|
||||
|
||||
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
|
||||
std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
|
||||
std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
|
||||
|
||||
bool read_toOrder = false;
|
||||
bool reorder = false;
|
||||
int *h_toOrder = nullptr;
|
||||
#endif
|
||||
|
||||
@@ -150,10 +154,16 @@ public:
|
||||
#endif
|
||||
|
||||
#if HAVE_OPENCL
|
||||
void setKernel(kernel_type *kernel_);
|
||||
void setKernel(kernel_type *kernel_, kernel_type_no_reorder *kernel_no_reorder_);
|
||||
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
|
||||
|
||||
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
|
||||
/// Those indices need to be mapped via toOrder
|
||||
/// \param[in] toOrder array with mappings
|
||||
/// \param[in] reorder whether reordering is actually used or not
|
||||
void setReordering(int *toOrder, bool reorder);
|
||||
void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
|
||||
void apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
|
||||
void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
|
||||
void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
|
||||
#endif
|
||||
|
||||
|
||||
@@ -440,6 +440,79 @@ namespace bda
|
||||
}
|
||||
)";
|
||||
|
||||
inline const char* stdwell_apply_no_reorder_s = R"(
|
||||
__kernel void stdwell_apply_no_reorder(__global const double *Cnnzs,
|
||||
__global const double *Dnnzs,
|
||||
__global const double *Bnnzs,
|
||||
__global const int *Ccols,
|
||||
__global const int *Bcols,
|
||||
__global const double *x,
|
||||
__global double *y,
|
||||
const unsigned int dim,
|
||||
const unsigned int dim_wells,
|
||||
__global const unsigned int *val_pointers,
|
||||
__local double *localSum,
|
||||
__local double *z1,
|
||||
__local double *z2){
|
||||
int wgId = get_group_id(0);
|
||||
int wiId = get_local_id(0);
|
||||
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
|
||||
int valsPerBlock = dim*dim_wells;
|
||||
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
|
||||
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
|
||||
int c = wiId % dim;
|
||||
int r = (wiId/dim) % dim_wells;
|
||||
double temp;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
localSum[wiId] = 0;
|
||||
if(wiId < numActiveWorkItems){
|
||||
int b = wiId/valsPerBlock + val_pointers[wgId];
|
||||
while(b < valSize + val_pointers[wgId]){
|
||||
int colIdx = Bcols[b];
|
||||
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
|
||||
b += numBlocksPerWarp;
|
||||
}
|
||||
|
||||
if(wiId < valsPerBlock){
|
||||
localSum[wiId] += localSum[wiId + valsPerBlock];
|
||||
}
|
||||
|
||||
b = wiId/valsPerBlock + val_pointers[wgId];
|
||||
|
||||
if(c == 0 && wiId < valsPerBlock){
|
||||
for(unsigned int stride = 2; stride > 0; stride >>= 1){
|
||||
localSum[wiId] += localSum[wiId + stride];
|
||||
}
|
||||
z1[r] = localSum[wiId];
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(wiId < dim_wells){
|
||||
temp = 0.0;
|
||||
for(unsigned int i = 0; i < dim_wells; ++i){
|
||||
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
|
||||
}
|
||||
z2[wiId] = temp;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(wiId < dim*valSize){
|
||||
temp = 0.0;
|
||||
int bb = wiId/dim + val_pointers[wgId];
|
||||
for (unsigned int j = 0; j < dim_wells; ++j){
|
||||
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
|
||||
}
|
||||
int colIdx = Ccols[bb];
|
||||
y[colIdx*dim + c] -= temp;
|
||||
}
|
||||
}
|
||||
)";
|
||||
|
||||
} // end namespace bda
|
||||
|
||||
#endif
|
||||
|
||||
@@ -20,7 +20,6 @@
|
||||
#include <config.h>
|
||||
#include <cmath>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
|
||||
#include <opm/common/OpmLog/OpmLog.hpp>
|
||||
#include <opm/common/ErrorMacros.hpp>
|
||||
@@ -318,7 +317,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
|
||||
double norm, norm_0;
|
||||
|
||||
if(wellContribs.getNumWells() > 0){
|
||||
wellContribs.setKernel(stdwell_apply_k.get());
|
||||
wellContribs.setKernel(stdwell_apply_k.get(), stdwell_apply_no_reorder_k.get());
|
||||
}
|
||||
|
||||
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
|
||||
@@ -479,6 +478,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
||||
source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
|
||||
source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
|
||||
source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
|
||||
source.emplace_back(std::make_pair(stdwell_apply_no_reorder_s, strlen(stdwell_apply_no_reorder_s)));
|
||||
program = cl::Program(*context, source);
|
||||
program.build(devices);
|
||||
|
||||
@@ -512,7 +512,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
||||
rb = new double[N];
|
||||
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
|
||||
}
|
||||
wcontainer->init(wellContribs, N, Nb, reorder);
|
||||
|
||||
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
|
||||
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
|
||||
@@ -529,6 +528,10 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
const unsigned int, const unsigned int, cl::Buffer&,
|
||||
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply")));
|
||||
stdwell_apply_no_reorder_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
const unsigned int, const unsigned int, cl::Buffer&,
|
||||
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply_no_reorder")));
|
||||
|
||||
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
|
||||
|
||||
@@ -653,14 +656,16 @@ bool openclSolverBackend<block_size>::analyse_matrix() {
|
||||
|
||||
|
||||
template <unsigned int block_size>
|
||||
void openclSolverBackend<block_size>::update_system(double *vals, double *b) {
|
||||
void openclSolverBackend<block_size>::update_system(double *vals, double *b, WellContributions &wellContribs) {
|
||||
Timer t;
|
||||
|
||||
mat->nnzValues = vals;
|
||||
if (opencl_ilu_reorder != ILUReorder::NONE) {
|
||||
reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
|
||||
wellContribs.setReordering(toOrder, true);
|
||||
} else {
|
||||
rb = b;
|
||||
wellContribs.setReordering(nullptr, false);
|
||||
}
|
||||
|
||||
if (verbosity > 2) {
|
||||
@@ -743,13 +748,13 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
|
||||
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
|
||||
}
|
||||
}
|
||||
update_system(vals, b);
|
||||
update_system(vals, b, wellContribs);
|
||||
if (!create_preconditioner()) {
|
||||
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||
}
|
||||
copy_system_to_gpu();
|
||||
} else {
|
||||
update_system(vals, b);
|
||||
update_system(vals, b, wellContribs);
|
||||
if (!create_preconditioner()) {
|
||||
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
|
||||
}
|
||||
|
||||
@@ -76,6 +76,10 @@ private:
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
const unsigned int, const unsigned int, cl::Buffer&,
|
||||
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k;
|
||||
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
cl::Buffer&, cl::Buffer&, cl::Buffer&,
|
||||
const unsigned int, const unsigned int, cl::Buffer&,
|
||||
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_no_reorder_k;
|
||||
|
||||
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
|
||||
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
|
||||
@@ -152,7 +156,8 @@ private:
|
||||
/// Reorder the linear system so it corresponds with the coloring
|
||||
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
|
||||
/// \param[in] b input vectors, contains N values
|
||||
void update_system(double *vals, double *b);
|
||||
/// \param[out] wellContribs WellContributions, to set reordering
|
||||
void update_system(double *vals, double *b, WellContributions &wellContribs);
|
||||
|
||||
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
|
||||
void update_system_on_gpu();
|
||||
|
||||
Reference in New Issue
Block a user