Merge pull request #2816 from ducbueno/new-opencl-stdwell

New standard well implementation for the OpenCL backend
This commit is contained in:
Markus Blatt 2020-09-30 09:32:19 +02:00 committed by GitHub
commit c26aefdd5b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 374 additions and 327 deletions

View File

@ -60,6 +60,7 @@ if(OPENCL_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
endif()
if(MPI_FOUND)

View File

@ -40,33 +40,6 @@ WellContributions::WellContributions(std::string gpu_mode){
}
}
void WellContributions::alloc()
{
if (num_std_wells > 0) {
#if HAVE_CUDA
if(cuda_gpu){
allocStandardWells();
}
#endif
#if HAVE_OPENCL
if(opencl_gpu){
h_Cnnzs_ocl = new double[num_blocks * dim * dim_wells];
h_Dnnzs_ocl = new double[num_std_wells * dim_wells * dim_wells];
h_Bnnzs_ocl = new double[num_blocks * dim * dim_wells];
h_Ccols_ocl = new int[num_blocks];
h_Bcols_ocl = new int[num_blocks];
val_pointers = new unsigned int[num_std_wells + 1];
allocated = true;
}
#endif
#if !HAVE_CUDA && !HAVE_OPENCL
OPM_THROW(std::logic_error, "Error cannot allocate on GPU because neither CUDA nor OpenCL were found by cmake");
#endif
}
}
WellContributions::~WellContributions()
{
@ -81,145 +54,44 @@ WellContributions::~WellContributions()
freeCudaMemory(); // should come before 'delete[] h_x'
}
#endif
#if HAVE_OPENCL
if (h_x_ocl) {
delete[] h_x_ocl;
delete[] h_y_ocl;
}
if(opencl_gpu){
if(num_std_wells > 0){
delete[] h_Cnnzs_ocl;
delete[] h_Dnnzs_ocl;
delete[] h_Bnnzs_ocl;
delete[] h_Ccols_ocl;
delete[] h_Bcols_ocl;
delete[] val_pointers;
}
}
#endif
}
#if HAVE_OPENCL
void WellContributions::setOpenCLContext(cl::Context *context_){
this->context = context_;
}
void WellContributions::setOpenCLQueue(cl::CommandQueue *queue_){
this->queue = queue_;
}
void WellContributions::setKernel(kernel_type *stdwell_apply_){
this->stdwell_apply = stdwell_apply_;
}
void WellContributions::init(){
if(num_std_wells > 0){
d_Cnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Dnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
d_Bnnzs_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Ccols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_Bcols_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_val_pointers_ocl = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
queue->enqueueWriteBuffer(d_Cnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Cnnzs_ocl);
queue->enqueueWriteBuffer(d_Dnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_std_wells * dim_wells * dim_wells, h_Dnnzs_ocl);
queue->enqueueWriteBuffer(d_Bnnzs_ocl, CL_TRUE, 0, sizeof(double) * num_blocks * dim * dim_wells, h_Bnnzs_ocl);
queue->enqueueWriteBuffer(d_Ccols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Ccols_ocl);
queue->enqueueWriteBuffer(d_Bcols_ocl, CL_TRUE, 0, sizeof(int) * num_blocks, h_Bcols_ocl);
queue->enqueueWriteBuffer(d_val_pointers_ocl, CL_TRUE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers);
}
}
void WellContributions::applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y) {
// apply MultisegmentWells
if (num_ms_wells > 0) {
// allocate pinned memory on host if not yet done
if (h_x_ocl == nullptr) {
h_x_ocl = new double[N];
h_y_ocl = new double[N];
}
// copy vectors x and y from GPU to CPU
queue->enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(double) * N, h_x_ocl);
queue->enqueueReadBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl);
// actually apply MultisegmentWells
for (MultisegmentWellContribution *well : multisegments) {
well->apply(h_x_ocl, h_y_ocl);
}
// copy vector y from CPU to GPU
queue->enqueueWriteBuffer(d_y, CL_TRUE, 0, sizeof(double) * N, h_y_ocl);
}
}
void WellContributions::applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y){
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
cl::Event event;
event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl, d_Ccols_ocl, d_Bcols_ocl, d_x, d_y, dim, dim_wells,
d_val_pointers_ocl, cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
}
void WellContributions::apply(cl::Buffer& d_x, cl::Buffer& d_y){
if(num_std_wells > 0){
applyStdWell(d_x, d_y);
}
if(num_ms_wells > 0){
applyMSWell(d_x, d_y);
}
}
#endif
void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
{
if (!allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
#if HAVE_CUDA
if(cuda_gpu){
if (!allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
addMatrixGpu(type, colIndices, values, val_size);
}
#endif
#if HAVE_OPENCL
if(opencl_gpu){
switch (type) {
case MatrixType::C:
std::copy(colIndices, colIndices + val_size, h_Ccols_ocl + num_blocks_so_far);
std::copy(values, values + val_size*dim*dim_wells, h_Cnnzs_ocl + num_blocks_so_far*dim*dim_wells);
break;
case MatrixType::D:
std::copy(values, values + dim_wells*dim_wells, h_Dnnzs_ocl + num_std_wells_so_far*dim_wells*dim_wells);
break;
case MatrixType::B:
std::copy(colIndices, colIndices + val_size, h_Bcols_ocl + num_blocks_so_far);
std::copy(values, values + val_size*dim*dim_wells, h_Bnnzs_ocl + num_blocks_so_far*dim*dim_wells);
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if(num_std_wells_so_far == num_std_wells - 1){
val_pointers[num_std_wells] = num_blocks;
}
break;
default:
OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
if(h_val_pointers_ocl.empty()){
h_val_pointers_ocl.push_back(0);
}
if (MatrixType::B == type) {
num_blocks_so_far += val_size;
num_std_wells_so_far++;
switch (type) {
case MatrixType::C:
h_Ccols_ocl.insert(h_Ccols_ocl.end(), colIndices, colIndices + val_size);
h_Cnnzs_ocl.insert(h_Cnnzs_ocl.end(), values, values + val_size * dim * dim_wells);
break;
case MatrixType::D:
h_Dnnzs_ocl.insert(h_Dnnzs_ocl.end(), values, values + dim_wells * dim_wells);
break;
case MatrixType::B:
h_Bcols_ocl.insert(h_Bcols_ocl.end(), colIndices, colIndices + val_size);
h_Bnnzs_ocl.insert(h_Bnnzs_ocl.end(), values, values + val_size * dim * dim_wells);
h_val_pointers_ocl.push_back(h_val_pointers_ocl.back() + val_size);
break;
default:
OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
}
}
@ -235,8 +107,15 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
{
dim = dim_;
dim_wells = dim_wells_;
if(dim != 3 || dim_wells != 4){
std::ostringstream oss;
oss << "WellContributions::setBlockSize error: dim and dim_wells must be equal to 3 and 4, respectively, otherwise the add well contributions kernel won't work.\n";
OPM_THROW(std::logic_error, oss.str());
}
}
#if HAVE_CUDA
void WellContributions::addNumBlocks(unsigned int numBlocks)
{
if (allocated) {
@ -246,6 +125,15 @@ void WellContributions::addNumBlocks(unsigned int numBlocks)
num_std_wells++;
}
void WellContributions::alloc()
{
if (num_std_wells > 0) {
allocStandardWells();
allocated = true;
}
}
#endif
void WellContributions::addMultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
unsigned int Nb, unsigned int Mb,
unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,

View File

@ -136,7 +136,6 @@ void WellContributions::allocStandardWells()
val_pointers = new unsigned int[num_std_wells + 1];
cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1));
cudaCheckLastError("apply_gpu malloc failed");
allocated = true;
}
}

View File

@ -55,7 +55,6 @@ namespace Opm
/// - copy data of wellcontributions
class WellContributions
{
public:
/// StandardWell has C, D and B matrices that need to be copied
enum class MatrixType {
@ -64,26 +63,34 @@ public:
B
};
private:
unsigned int num_blocks = 0; // total number of blocks in all wells
unsigned int dim; // number of columns in blocks in B and C, equal to StandardWell::numEq
unsigned int dim_wells; // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
unsigned int num_std_wells = 0; // number of StandardWells in this object
unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size()
unsigned int num_blocks_so_far = 0; // keep track of where next data is written
unsigned int num_std_wells_so_far = 0; // keep track of where next data is written
unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
unsigned int N; // number of rows (not blockrows) in vectors x and y
bool allocated = false;
std::vector<MultisegmentWellContribution*> multisegments;
#if HAVE_OPENCL
std::vector<double> h_Cnnzs_ocl, h_Dnnzs_ocl, h_Bnnzs_ocl;
std::vector<int> h_Ccols_ocl, h_Bcols_ocl;
std::vector<unsigned int> h_val_pointers_ocl;
std::vector<double> h_x_ocl, h_y_ocl;
int *toOrder = nullptr;
bool reorder = false;
#endif
private:
unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size()
unsigned int N; // number of rows (not blockrows) in vectors x and y
std::vector<MultisegmentWellContribution*> multisegments;
bool opencl_gpu = false;
bool cuda_gpu = false;
#if HAVE_CUDA
bool allocated = false;
unsigned int num_blocks = 0; // total number of blocks in all wells
unsigned int num_std_wells = 0; // number of StandardWells in this object
unsigned int num_blocks_so_far = 0; // keep track of where next data is written
unsigned int num_std_wells_so_far = 0; // keep track of where next data is written
unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
cudaStream_t stream;
// data for StandardWells, could remain nullptrs if not used
@ -97,30 +104,7 @@ private:
unsigned int *d_val_pointers = nullptr;
double *h_x = nullptr;
double *h_y = nullptr;
#endif
#if HAVE_OPENCL
double *h_Cnnzs_ocl = nullptr;
double *h_Dnnzs_ocl = nullptr;
double *h_Bnnzs_ocl = nullptr;
int *h_Ccols_ocl = nullptr;
int *h_Bcols_ocl = nullptr;
double *h_x_ocl = nullptr;
double *h_y_ocl = nullptr;
cl::Buffer d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
cl::Buffer d_Ccols_ocl, d_Bcols_ocl, d_val_pointers_ocl;
typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, const unsigned int, const unsigned int,
cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
kernel_type *stdwell_apply;
cl::Context *context;
cl::CommandQueue *queue;
#endif
#if HAVE_CUDA
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
@ -135,12 +119,11 @@ private:
void freeCudaMemory();
#endif
#if HAVE_OPENCL
void applyStdWell(cl::Buffer& d_x, cl::Buffer& d_y);
void applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y);
#endif
public:
//#if HAVE_OPENCL
// void applyMSWell(cl::Buffer& d_x, cl::Buffer& d_y);
//#endif
#if HAVE_CUDA
/// Set a cudaStream to be used
/// \param[in] stream the cudaStream that is used to launch the kernel in
@ -155,14 +138,13 @@ public:
unsigned int getNumWells(){
return num_std_wells + num_ms_wells;
}
#endif
#if HAVE_OPENCL
void init();
void apply(cl::Buffer& x, cl::Buffer& y);
void setOpenCLContext(cl::Context *context);
void setOpenCLQueue(cl::CommandQueue *queue);
void setKernel(kernel_type *stdwell_apply);
/// Indicate how large the next StandardWell is, this function cannot be called after alloc() is called
/// \param[in] numBlocks number of blocks in C and B of next StandardWell
void addNumBlocks(unsigned int numBlocks);
/// Allocate memory for the StandardWells
void alloc();
#endif
/// Create a new WellContributions
@ -171,18 +153,11 @@ public:
/// Destroy a WellContributions, and free memory
~WellContributions();
/// Allocate memory for the StandardWells
void alloc();
/// Indicate how large the blocks of the StandardWell (C and B) are
/// \param[in] dim number of columns
/// \param[in] dim_wells number of rows
void setBlockSize(unsigned int dim, unsigned int dim_wells);
/// Indicate how large the next StandardWell is, this function cannot be called after alloc() is called
/// \param[in] numBlocks number of blocks in C and B of next StandardWell
void addNumBlocks(unsigned int numBlocks);
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent

View File

@ -0,0 +1,139 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp>
#include<iostream>
namespace bda
{
using Opm::OpmLog;
using Dune::Timer;
/// One-time setup: record matrix dimensions and create the device buffers
/// for the standard-well data of \p wellContribs.
/// \param[in] wellContribs   host-side well contributions to size buffers from
/// \param[in] Nb_            number of block rows in the matrix
void WellContributionsOCLContainer::init(Opm::WellContributions &wellContribs, int Nb_){
    Nb = Nb_;
    dim = wellContribs.dim;
    dim_wells = wellContribs.dim_wells;

    if(!wellContribs.h_val_pointers_ocl.empty()){
        // reinit() performs exactly the (re)allocation this function needs;
        // delegate to it instead of duplicating the buffer-creation code.
        reinit(wellContribs);
        // The reordering table only needs to be allocated once: its size (Nb)
        // does not change between calls, so it is not part of reinit().
        s.toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
    }
    else{
        num_std_wells = 0;
    }
}
/// Refresh the well counts and recreate every per-well device buffer so the
/// allocations match the current host-side vectors exactly.
void WellContributionsOCLContainer::reinit(Opm::WellContributions &wellContribs){
    // h_val_pointers_ocl holds one entry per well plus a trailing sentinel.
    num_blocks = wellContribs.h_Ccols_ocl.size();
    num_std_wells = wellContribs.h_val_pointers_ocl.size() - 1;

    const size_t szCnnzs = sizeof(double) * wellContribs.h_Cnnzs_ocl.size();
    const size_t szDnnzs = sizeof(double) * wellContribs.h_Dnnzs_ocl.size();
    const size_t szBnnzs = sizeof(double) * wellContribs.h_Bnnzs_ocl.size();
    const size_t szCcols = sizeof(int) * wellContribs.h_Ccols_ocl.size();
    const size_t szBcols = sizeof(int) * wellContribs.h_Bcols_ocl.size();
    const size_t szVptrs = sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size();

    s.Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, szCnnzs);
    s.Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, szDnnzs);
    s.Bnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, szBnnzs);
    s.Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, szCcols);
    s.Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, szBcols);
    s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, szVptrs);
}
/// Copy all standard-well data (matrices, column indices, row pointers and
/// the reordering table) from \p wellContribs to the device buffers.
void WellContributionsOCLContainer::copy_to_gpu(Opm::WellContributions &wellContribs){
    if(num_std_wells > 0){
        // assign() instead of insert(end, ...): repeated calls must overwrite
        // the table, not keep appending and grow it past Nb entries.
        toOrder.assign(wellContribs.toOrder, wellContribs.toOrder + Nb);

        // Enqueue all writes without blocking, then wait for them together.
        std::vector<cl::Event> events(7);
        queue->enqueueWriteBuffer(s.Cnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data(), nullptr, &events[0]);
        queue->enqueueWriteBuffer(s.Dnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data(), nullptr, &events[1]);
        queue->enqueueWriteBuffer(s.Bnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Bnnzs_ocl.size(), wellContribs.h_Bnnzs_ocl.data(), nullptr, &events[2]);
        queue->enqueueWriteBuffer(s.Ccols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Ccols_ocl.size(), wellContribs.h_Ccols_ocl.data(), nullptr, &events[3]);
        queue->enqueueWriteBuffer(s.Bcols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Bcols_ocl.size(), wellContribs.h_Bcols_ocl.data(), nullptr, &events[4]);
        queue->enqueueWriteBuffer(s.val_pointers, CL_FALSE, 0, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size(), wellContribs.h_val_pointers_ocl.data(), nullptr, &events[5]);
        queue->enqueueWriteBuffer(s.toOrder, CL_FALSE, 0, sizeof(int) * toOrder.size(), toOrder.data(), nullptr, &events[6]);
        // waitForEvents is a static member; no dummy cl::Event object needed.
        cl::Event::waitForEvents(events);
    }
}
/// Re-upload the standard-well values for a new timestep, reallocating the
/// device buffers first if the well topology changed.
void WellContributionsOCLContainer::update_on_gpu(Opm::WellContributions &wellContribs){
    if(num_std_wells > 0){
        // h_val_pointers_ocl has num_std_wells + 1 entries, so the number of
        // wells it describes is size() - 1. The previous comparison against
        // size() was always true and forced a full reinit() on every call.
        if(num_std_wells != wellContribs.h_val_pointers_ocl.size() - 1 ||
           num_blocks != wellContribs.h_Ccols_ocl.size()){
            reinit(wellContribs);
        }

        // Enqueue all writes without blocking, then wait for them together.
        std::vector<cl::Event> events(6);
        queue->enqueueWriteBuffer(s.Cnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data(), nullptr, &events[0]);
        queue->enqueueWriteBuffer(s.Dnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data(), nullptr, &events[1]);
        queue->enqueueWriteBuffer(s.Bnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Bnnzs_ocl.size(), wellContribs.h_Bnnzs_ocl.data(), nullptr, &events[2]);
        queue->enqueueWriteBuffer(s.Ccols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Ccols_ocl.size(), wellContribs.h_Ccols_ocl.data(), nullptr, &events[3]);
        queue->enqueueWriteBuffer(s.Bcols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Bcols_ocl.size(), wellContribs.h_Bcols_ocl.data(), nullptr, &events[4]);
        queue->enqueueWriteBuffer(s.val_pointers, CL_FALSE, 0, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size(), wellContribs.h_val_pointers_ocl.data(), nullptr, &events[5]);
        // waitForEvents is a static member; no dummy cl::Event object needed.
        cl::Event::waitForEvents(events);
    }
}
/// Store a non-owning pointer to the OpenCL context to allocate buffers in.
void WellContributionsOCLContainer::setOpenCLContext(cl::Context *context_){
    context = context_;
}
/// Store a non-owning pointer to the command queue used for all transfers
/// and kernel launches.
void WellContributionsOCLContainer::setOpenCLQueue(cl::CommandQueue *queue_){
    queue = queue_;
}
/// Store a non-owning pointer to the compiled stdwell_apply kernel wrapper.
void WellContributionsOCLContainer::setKernel(cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                              cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
                                                              const unsigned int, const unsigned int, cl::Buffer&,
                                                              cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> *stdwell_apply_){
    stdwell_apply = stdwell_apply_;
}
/// Launch the stdwell_apply kernel: y -= C^T * D^-1 * (B * x) for every
/// standard well, one work group per well.
void WellContributionsOCLContainer::applyStdWells(cl::Buffer& x, cl::Buffer& y){
    // One 32-item work group per standard well.
    const unsigned int workGroupSize = 32;
    const unsigned int totalWorkItems = num_std_wells * workGroupSize;
    // Local scratch: one partial sum per work item plus two dim_wells-sized
    // intermediate vectors (z1, z2).
    const unsigned int lmemSum = sizeof(double) * workGroupSize;
    const unsigned int lmemZ = sizeof(double) * dim_wells;

    (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(totalWorkItems), cl::NDRange(workGroupSize)),
                     s.Cnnzs, s.Dnnzs, s.Bnnzs, s.Ccols, s.Bcols, x, y, s.toOrder,
                     dim, dim_wells, s.val_pointers,
                     cl::Local(lmemSum), cl::Local(lmemZ), cl::Local(lmemZ));
}
/// Apply all well contributions to y (currently standard wells only).
void WellContributionsOCLContainer::apply(cl::Buffer& x, cl::Buffer& y){
    if(num_std_wells == 0){
        return; // nothing to do
    }
    applyStdWells(x, y);
}
WellContributionsOCLContainer::~WellContributionsOCLContainer(){
    // Nothing to release manually: std::vector and cl::Buffer members clean
    // up after themselves (the previous explicit toOrder.clear() was
    // redundant — the vector is destroyed right after anyway).
}
} // end namespace bda

View File

@ -0,0 +1,72 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef WELLCONTRIBUTIONSOCLCONTAINER_HPP
#define WELLCONTRIBUTIONSOCLCONTAINER_HPP
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace bda
{
class WellContributionsOCLContainer
{
private:
unsigned int dim, dim_wells;
unsigned int num_blocks = 0;
unsigned int num_std_wells = 0;
unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size()
int Nb;
std::vector<int> toOrder;
typedef struct {
cl::Buffer Cnnzs, Dnnzs, Bnnzs;
cl::Buffer Ccols, Bcols;
cl::Buffer val_pointers, toOrder;
} GPU_storage;
GPU_storage s;
cl::Context *context;
cl::CommandQueue *queue;
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> *stdwell_apply;
void reinit(Opm::WellContributions &wellContribs);
void applyStdWells(cl::Buffer& x, cl::Buffer& y);
public:
WellContributionsOCLContainer() {};
~WellContributionsOCLContainer();
void apply(cl::Buffer& x, cl::Buffer& y);
void init(Opm::WellContributions &wellContribs, int Nb);
void copy_to_gpu(Opm::WellContributions &wellContribs);
void update_on_gpu(Opm::WellContributions &wellContribs);
void setOpenCLContext(cl::Context *context);
void setOpenCLQueue(cl::CommandQueue *queue);
void setKernel(cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> *stdwell_apply_);
};
} // end namespace bda
#endif

View File

@ -366,96 +366,76 @@ namespace bda
}
)";
inline const char* add_well_contributions_s = R"(
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
void atomicAdd(volatile __global double *val, const double delta){
union{
double f;
ulong i;
} old;
union{
double f;
ulong i;
} new;
do{
old.f = *val;
new.f = old.f + delta;
} while(atom_cmpxchg((volatile __global ulong *)val, old.i, new.i) != old.i);
}
__kernel void add_well_contributions(__global const double *valsC,
__global const double *valsD,
__global const double *valsB,
__global const int *colsC,
__global const int *colsB,
__global const double *x,
__global double *y,
const unsigned int blnc,
const unsigned int blnr,
__global const unsigned int *rowptr,
__local double *localSum,
__local double *z1,
__local double *z2){
inline const char* stdwell_apply_s = R"(
__kernel void stdwell_apply(__global const double *Cnnzs,
__global const double *Dnnzs,
__global const double *Bnnzs,
__global const int *Ccols,
__global const int *Bcols,
__global const double *x,
__global double *y,
__global const int *toOrder,
const unsigned int dim,
const unsigned int dim_wells,
__global const unsigned int *val_pointers,
__local double *localSum,
__local double *z1,
__local double *z2){
int wgId = get_group_id(0);
int wiId = get_local_id(0);
int valSize = rowptr[wgId + 1] - rowptr[wgId];
int valsPerBlock = blnc*blnr;
int numActiveWorkItems = (32/valsPerBlock)*valsPerBlock;
int numBlocksPerWarp = 32/valsPerBlock;
int c = wiId % blnc;
int r = (wiId/blnc) % blnr;
int valSize = val_pointers[wgId + 1] - val_pointers[wgId];
int valsPerBlock = dim*dim_wells;
int numActiveWorkItems = (get_local_size(0)/valsPerBlock)*valsPerBlock;
int numBlocksPerWarp = get_local_size(0)/valsPerBlock;
int c = wiId % dim;
int r = (wiId/dim) % dim_wells;
double temp;
barrier(CLK_LOCAL_MEM_FENCE);
localSum[wiId] = 0;
if(wiId < numActiveWorkItems){
int b = wiId/valsPerBlock + rowptr[wgId];
while(b < valSize + rowptr[wgId]){
int colIdx = colsB[b];
localSum[wiId] += valsB[b*blnc*blnr + r*blnc + c]*x[colIdx*blnc + c];
int b = wiId/valsPerBlock + val_pointers[wgId];
while(b < valSize + val_pointers[wgId]){
int colIdx = toOrder[Bcols[b]];
localSum[wiId] += Bnnzs[b*dim*dim_wells + r*dim + c]*x[colIdx*dim + c];
b += numBlocksPerWarp;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
int stride = valsPerBlock;
if(wiId < stride){
localSum[wiId] += localSum[wiId + stride];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(c == 0 && wiId < valsPerBlock){
for(stride = 2; stride > 0; stride /= 2){
localSum[wiId] += localSum[wiId + stride];
if(wiId < valsPerBlock){
localSum[wiId] += localSum[wiId + valsPerBlock];
}
b = wiId/valsPerBlock + val_pointers[wgId];
if(c == 0 && wiId < valsPerBlock){
for(unsigned int stride = 2; stride > 0; stride >>= 1){
localSum[wiId] += localSum[wiId + stride];
}
z1[r] = localSum[wiId];
}
z1[r] = localSum[wiId];
}
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < blnr){
if(wiId < dim_wells){
temp = 0.0;
for(unsigned int i = 0; i < blnr; ++i){
temp += valsD[wgId*blnr*blnr + wiId*blnr + i]*z1[i];
for(unsigned int i = 0; i < dim_wells; ++i){
temp += Dnnzs[wgId*dim_wells*dim_wells + wiId*dim_wells + i]*z1[i];
}
z2[wiId] = temp;
}
barrier(CLK_GLOBAL_MEM_FENCE);
barrier(CLK_LOCAL_MEM_FENCE);
if(wiId < blnc*valSize){
if(wiId < dim*valSize){
temp = 0.0;
int bb = wiId/blnc + rowptr[wgId];
int colIdx = colsC[bb];
for (unsigned int j = 0; j < blnr; ++j){
temp += valsC[bb*blnc*blnr + j*blnc + c]*z2[j];
int bb = wiId/dim + val_pointers[wgId];
for (unsigned int j = 0; j < dim_wells; ++j){
temp += Cnnzs[bb*dim*dim_wells + j*dim + c]*z2[j];
}
atomicAdd(&y[colIdx*blnc + c], temp);
int colIdx = toOrder[Ccols[bb]];
y[colIdx*dim + c] -= temp;
}
}
)";

View File

@ -51,6 +51,7 @@ using Dune::Timer;
template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
prec = new Preconditioner(LEVEL_SCHEDULING, GRAPH_COLORING, verbosity_);
wcontainer = new WContainer();
}
@ -181,7 +182,7 @@ void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
float it;
double rho, rhop, beta, alpha, omega, tmp1, tmp2;
double norm, norm_0;
@ -208,8 +209,6 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
queue->enqueueCopyBuffer(d_r, d_p, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
wellContribs.setReordering(toOrder, true);
norm = norm_w(d_r, d_tmp);
norm_0 = norm;
@ -242,7 +241,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions
t_well.start();
wellContribs.apply(d_pw, d_v);
wcontainer->apply(d_pw, d_v);
t_well.stop();
t_rest.start();
@ -271,7 +270,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions
t_well.start();
wellContribs.apply(d_s, d_t);
wcontainer->apply(d_s, d_t);
t_well.stop();
t_rest.start();
@ -319,7 +318,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
template <unsigned int block_size>
void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, WellContributions &wellContribs) {
this->N = N_;
this->nnz = nnz_;
this->nnzb = nnz_ / block_size / block_size;
@ -461,7 +460,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
source.emplace_back(std::make_pair(spmv_blocked_s, strlen(spmv_blocked_s)));
source.emplace_back(std::make_pair(ILU_apply1_s, strlen(ILU_apply1_s)));
source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
source.emplace_back(std::make_pair(add_well_contributions_s, strlen(add_well_contributions_s)));
source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
program = cl::Program(*context, source);
program.build(devices);
@ -471,6 +470,8 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
prec->setOpenCLContext(context.get());
prec->setOpenCLQueue(queue.get());
wcontainer->setOpenCLContext(context.get());
wcontainer->setOpenCLQueue(queue.get());
rb = new double[N];
tmp = new double[N];
@ -495,6 +496,8 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
wcontainer->init(wellContribs, Nb);
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@ -506,8 +509,13 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
spmv_blocked_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "spmv_blocked")));
ILU_apply1_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply1")));
ILU_apply2_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg>(cl::Kernel(program, "ILU_apply2")));
stdwell_apply_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply")));
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
wcontainer->setKernel(stdwell_apply_k.get());
} catch (const cl::Error& error) {
std::ostringstream oss;
@ -523,16 +531,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
initialized = true;
} // end initialize()
template <unsigned int block_size>
void openclSolverBackend<block_size>::initialize_wellContribs(WellContributions& wellContribs){
add_well_contributions_k.reset(new cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "add_well_contributions")));
wellContribs.setOpenCLContext(context.get());
wellContribs.setOpenCLQueue(queue.get());
wellContribs.init();
wellContribs.setKernel(add_well_contributions_k.get());
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::finalize() {
delete[] rb;
@ -541,11 +539,12 @@ void openclSolverBackend<block_size>::finalize() {
delete[] vals_contiguous;
#endif
delete prec;
delete wcontainer;
} // end finalize()
template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu() {
void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions &wellContribs) {
Timer t;
cl::Event event;
@ -567,6 +566,9 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
wellContribs.setReordering(toOrder, true);
wcontainer->copy_to_gpu(wellContribs);
if (verbosity > 2) {
std::ostringstream out;
out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
@ -574,10 +576,9 @@ void openclSolverBackend<block_size>::copy_system_to_gpu() {
}
} // end copy_system_to_gpu()
// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system_on_gpu() {
void openclSolverBackend<block_size>::update_system_on_gpu(WellContributions &wellContribs) {
Timer t;
cl::Event event;
@ -597,6 +598,8 @@ void openclSolverBackend<block_size>::update_system_on_gpu() {
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
wcontainer->update_on_gpu(wellContribs);
if (verbosity > 2) {
std::ostringstream out;
out << "openclSolver::update_system_on_gpu(): " << t.stop() << " s";
@ -663,12 +666,12 @@ bool openclSolverBackend<block_size>::create_preconditioner() {
template <unsigned int block_size>
void openclSolverBackend<block_size>::solve_system(WellContributions& wellContribs, BdaResult &res) {
void openclSolverBackend<block_size>::solve_system(BdaResult &res) {
Timer t;
// actually solve
try {
gpu_pbicgstab(wellContribs, res);
gpu_pbicgstab(res);
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n";
@ -709,8 +712,7 @@ void openclSolverBackend<block_size>::get_result(double *x) {
template <unsigned int block_size>
SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
if (initialized == false) {
initialize(N_, nnz_, dim, vals, rows, cols);
initialize_wellContribs(wellContribs);
initialize(N_, nnz_, dim, vals, rows, cols, wellContribs);
if (analysis_done == false) {
if (!analyse_matrix()) {
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
@ -720,16 +722,15 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
copy_system_to_gpu();
copy_system_to_gpu(wellContribs);
} else {
update_system(vals, b);
initialize_wellContribs(wellContribs);
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
update_system_on_gpu();
update_system_on_gpu(wellContribs);
}
solve_system(wellContribs, res);
solve_system(res);
return SolverStatus::BDA_SOLVER_SUCCESS;
}

View File

@ -21,11 +21,10 @@
#define OPM_OPENCLSOLVER_BACKEND_HEADER_INCLUDED
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp>
#include <opm/simulators/linalg/bda/BILU0.hpp>
namespace bda
@ -35,9 +34,9 @@ namespace bda
template <unsigned int block_size>
class openclSolverBackend : public BdaSolver<block_size>
{
typedef BdaSolver<block_size> Base;
typedef BILU0<block_size> Preconditioner;
typedef WellContributionsOCLContainer WContainer;
using Base::N;
using Base::Nb;
@ -51,7 +50,6 @@ class openclSolverBackend : public BdaSolver<block_size>
using Base::initialized;
private:
double *rb = nullptr; // reordered b vector, the matrix is reordered, so b must also be
double *vals_contiguous = nullptr; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
@ -64,13 +62,6 @@ private:
cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm()
double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm()
//unsigned int num_blocks, dim_, dim_wells, num_std_wells;
//unsigned int *h_val_pointers;
//int *h_Ccols, *h_Bcols;
//double *h_Cnnzs, *h_Dnnzs, *h_Bnnzs;
//cl::Buffer d_Cnnzs, d_Dnnzs, d_Bnnzs;
//cl::Buffer d_Ccols, d_Bcols, d_val_pointers;
// shared pointers are also passed to other objects
cl::Program program;
std::shared_ptr<cl::Context> context;
@ -82,14 +73,16 @@ private:
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > spmv_blocked_k;
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply1_k;
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::LocalSpaceArg> > ILU_apply2_k;
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&, cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > add_well_contributions_k;
std::shared_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k;
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
WContainer *wcontainer = nullptr;
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr; // original matrix
BlockedMatrix<block_size> *rmat = nullptr; // reordered matrix, used for spmv
BlockedMatrix<block_size> *rmat = nullptr; // reordered matrix, used for spmv
/// Divide A by B, and round up: return (int)ceil(A/B)
/// \param[in] A dividend
@ -136,12 +129,10 @@ private:
/// \param[out] b output vector
void spmv_blocked_w(cl::Buffer vals, cl::Buffer cols, cl::Buffer rows, cl::Buffer x, cl::Buffer b);
//void add_well_contributions_w(cl::Buffer valsC, cl::Buffer valsD, cl::Buffer valsB, cl::Buffer colsC, cl::Buffer colsB, cl::Buffer x, cl::Buffer y, cl::Buffer val_pointers);
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
void gpu_pbicgstab(BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
@ -150,15 +141,13 @@ private:
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
void initialize_wellContribs(WellContributions& wellContribs);
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols, WellContributions &wellContribs);
/// Clean memory
void finalize();
/// Copy linear system to GPU
void copy_system_to_gpu();
void copy_system_to_gpu(WellContributions &wellContribs);
/// Reorder the linear system so it corresponds with the coloring
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
@ -166,7 +155,7 @@ private:
void update_system(double *vals, double *b);
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
void update_system_on_gpu();
void update_system_on_gpu(WellContributions &wellContribs);
/// Analyse sparsity pattern to extract parallelism
/// \return true iff analysis was successful
@ -179,7 +168,7 @@ private:
/// Solve linear system
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void solve_system(WellContributions& wellContribs, BdaResult &res);
void solve_system(BdaResult &res);
public:

View File

@ -916,6 +916,8 @@ namespace Opm {
{
// prepare for StandardWells
wellContribs.setBlockSize(StandardWell<TypeTag>::numEq, StandardWell<TypeTag>::numStaticWellEq);
#if HAVE_CUDA
for(unsigned int i = 0; i < well_container_.size(); i++){
auto& well = well_container_[i];
std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
@ -928,6 +930,7 @@ namespace Opm {
// allocate memory for data from StandardWells
wellContribs.alloc();
#endif
for(unsigned int i = 0; i < well_container_.size(); i++){
auto& well = well_container_[i];