Merge pull request #2914 from ducbueno/rm-oclcontainer

Removed WellContributionsOCLContainer class
This commit is contained in:
Markus Blatt 2020-11-19 21:27:55 +01:00 committed by GitHub
commit b21b6cf28c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 382 additions and 496 deletions

View File

@ -61,7 +61,6 @@ if(OPENCL_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributionsOCLContainer.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
endif()
if(MPI_FOUND)
@ -172,7 +171,6 @@ list (APPEND PUBLIC_HEADER_FILES
opm/simulators/linalg/bda/openclSolverBackend.hpp
opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
opm/simulators/linalg/bda/WellContributions.hpp
opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp
opm/simulators/linalg/amgcpr.hh
opm/simulators/linalg/twolevelmethodcpr.hh
opm/simulators/linalg/ExtractParallelGridInformationToISTL.hpp

View File

@ -251,6 +251,12 @@ namespace Opm
if (use_gpu) {
const std::string gpu_mode = EWOMS_GET_PARAM(TypeTag, std::string, GpuMode);
WellContributions wellContribs(gpu_mode);
#if HAVE_OPENCL
if(gpu_mode.compare("opencl") == 0){
const auto openclBackend = static_cast<const bda::openclSolverBackend<block_size>*>(&bdaBridge->getBackend());
wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
}
#endif
if (!useWellConn_) {
simulator_.problem().wellModel().getWellContributions(wellContribs);
}
@ -301,7 +307,6 @@ namespace Opm
const std::any& parallelInformation() const { return parallelInformation_; }
protected:
// 3x3 matrix block inversion was unstable at least 2.3 until and including
// 2.5.0. There may still be some issue with the 4x4 matrix block inversion
// we therefore still use the block inversion in OPM

View File

@ -222,7 +222,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::get_result(BridgeVector
}
}
#define INSTANTIATE_BDA_FUNCTIONS(n) \
#define INSTANTIATE_BDA_FUNCTIONS(n) \
template BdaBridge<Dune::BCRSMatrix<Opm::MatrixBlock<double, n, n>, std::allocator<Opm::MatrixBlock<double, n, n> > >, \
Dune::BlockVector<Dune::FieldVector<double, n>, std::allocator<Dune::FieldVector<double, n> > >, \
n>::BdaBridge \

View File

@ -47,8 +47,8 @@ template <class BridgeMatrix, class BridgeVector, int block_size>
class BdaBridge
{
private:
std::unique_ptr<bda::BdaSolver<block_size> > backend;
bool use_gpu = false;
std::unique_ptr<bda::BdaSolver<block_size> > backend;
public:
/// Construct a BdaBridge
@ -61,7 +61,6 @@ public:
/// \param[in] opencl_ilu_reorder select either level_scheduling or graph_coloring, see BILU0.hpp for explanation
BdaBridge(std::string gpu_mode, int linear_solver_verbosity, int maxit, double tolerance, unsigned int platformID, unsigned int deviceID, std::string opencl_ilu_reorder);
/// Solve linear system, A*x = b
/// \warning Values of A might get overwritten!
/// \param[in] mat matrix A, should be of type Dune::BCRSMatrix
@ -80,6 +79,10 @@ public:
return use_gpu;
}
const bda::BdaSolver<block_size>& getBackend() const {
return *backend;
}
}; // end class BdaBridge
}

View File

@ -66,7 +66,6 @@ namespace bda
bool initialized = false;
public:
/// Construct a BdaSolver, can be cusparseSolver or openclSolver
/// \param[in] linear_solver_verbosity verbosity of solver
/// \param[in] maxit maximum number of iterations for solver

View File

@ -17,15 +17,13 @@
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h> // CMake
#include <cstdlib>
#include <cstring>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include "opm/simulators/linalg/bda/WellContributions.hpp"
#include <opm/simulators/linalg/bda/WellContributions.hpp>
namespace Opm
{
@ -34,75 +32,180 @@ WellContributions::WellContributions(std::string gpu_mode){
if(gpu_mode.compare("cusparse") == 0){
cuda_gpu = true;
}
if(gpu_mode.compare("opencl") == 0){
else if(gpu_mode.compare("opencl") == 0){
opencl_gpu = true;
}
else{
OPM_THROW(std::logic_error, "Error: invalid GPU mode");
}
}
WellContributions::~WellContributions()
{
#if HAVE_CUDA
// delete MultisegmentWellContributions
for (auto ms : multisegments) {
for (auto ms: multisegments) {
delete ms;
}
multisegments.clear();
#if HAVE_CUDA
if(cuda_gpu){
freeCudaMemory(); // should come before 'delete[] h_x'
}
#endif
if(num_std_wells > 0){
delete[] val_pointers;
#if HAVE_OPENCL
if(opencl_gpu){
delete[] h_toOrder;
}
#endif
}
#if HAVE_OPENCL
if(opencl_gpu){
if(num_ms_wells > 0){
delete[] h_x;
delete[] h_y;
}
}
#endif
}
#if HAVE_OPENCL
void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_){
this->context = context_;
this->queue = queue_;
}
void WellContributions::setKernel(kernel_type *kernel_){
this->kernel = kernel_;
}
void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
cl::Event event;
event = (*kernel)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl, d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
}
void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
if(h_x == nullptr){
h_x = new double[N];
h_y = new double[N];
}
if(h_toOrder == nullptr){
h_toOrder = new int[Nb];
}
if(!read_toOrder){
events.resize(1);
queue->enqueueReadBuffer(d_toOrder, CL_FALSE, 0, sizeof(int) * Nb, h_toOrder, nullptr, &events[0]);
events[0].wait();
events.clear();
read_toOrder = true;
}
events.resize(2);
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]);
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
// actually apply MultisegmentWells
for(Opm::MultisegmentWellContribution *well: multisegments){
well->setReordering(h_toOrder, true);
well->apply(h_x, h_y);
}
// copy vector y from CPU to GPU
events.resize(1);
queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[0]);
events[0].wait();
events.clear();
}
void WellContributions::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
if(num_std_wells > 0){
apply_stdwells(d_x, d_y, d_toOrder);
}
if(num_ms_wells > 0){
apply_mswells(d_x, d_y, d_toOrder);
}
}
#endif
void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
{
if (!allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
#if HAVE_CUDA
if(cuda_gpu){
if (!allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
addMatrixGpu(type, colIndices, values, val_size);
}
#endif
#if HAVE_OPENCL
if(opencl_gpu){
if(h_val_pointers_ocl.empty()){
h_val_pointers_ocl.push_back(0);
}
switch (type) {
case MatrixType::C:
h_Ccols_ocl.insert(h_Ccols_ocl.end(), colIndices, colIndices + val_size);
h_Cnnzs_ocl.insert(h_Cnnzs_ocl.end(), values, values + val_size * dim * dim_wells);
events.resize(2);
queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
break;
case MatrixType::D:
h_Dnnzs_ocl.insert(h_Dnnzs_ocl.end(), values, values + dim_wells * dim_wells);
events.resize(1);
queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]);
events[0].wait();
events.clear();
break;
case MatrixType::B:
h_Bcols_ocl.insert(h_Bcols_ocl.end(), colIndices, colIndices + val_size);
h_Bnnzs_ocl.insert(h_Bnnzs_ocl.end(), values, values + val_size * dim * dim_wells);
h_val_pointers_ocl.push_back(h_val_pointers_ocl.back() + val_size);
events.resize(2);
queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
events.resize(1);
queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers, nullptr, &events[0]);
events[0].wait();
events.clear();
}
break;
default:
OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
}
}
#endif
if(MatrixType::B == type) {
num_blocks_so_far += val_size;
num_std_wells_so_far++;
}
#if !HAVE_CUDA && !HAVE_OPENCL
OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
#endif
}
void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
{
dim = dim_;
@ -115,7 +218,6 @@ void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
}
}
#if HAVE_CUDA
void WellContributions::addNumBlocks(unsigned int numBlocks)
{
if (allocated) {
@ -128,19 +230,36 @@ void WellContributions::addNumBlocks(unsigned int numBlocks)
void WellContributions::alloc()
{
if (num_std_wells > 0) {
allocStandardWells();
val_pointers = new unsigned int[num_std_wells + 1];
#if HAVE_CUDA
if(cuda_gpu){
allocStandardWells();
}
#endif
#if HAVE_OPENCL
if(opencl_gpu){
d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
}
#endif
allocated = true;
}
}
#endif
void WellContributions::addMultisegmentWellContribution(unsigned int dim_, unsigned int dim_wells_,
unsigned int Nb, unsigned int Mb,
unsigned int Nb_, unsigned int Mb,
unsigned int BnumBlocks, std::vector<double> &Bvalues, std::vector<unsigned int> &BcolIndices, std::vector<unsigned int> &BrowPointers,
unsigned int DnumBlocks, double *Dvalues, UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
std::vector<double> &Cvalues)
{
assert(dim==dim_);
this->Nb = Nb_;
this->N = Nb * dim_;
MultisegmentWellContribution *well = new MultisegmentWellContribution(dim_, dim_wells_, Nb, Mb, BnumBlocks, Bvalues, BcolIndices, BrowPointers, DnumBlocks, Dvalues, DcolPointers, DrowIndices, Cvalues);
multisegments.emplace_back(well);

View File

@ -127,16 +127,13 @@ __global__ void apply_well_contributions(
void WellContributions::allocStandardWells()
{
if (num_std_wells > 0) {
cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
val_pointers = new unsigned int[num_std_wells + 1];
cudaMalloc((void**)&d_val_pointers, sizeof(int) * (num_std_wells + 1));
cudaCheckLastError("apply_gpu malloc failed");
}
cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
cudaCheckLastError("apply_gpu malloc failed");
}
void WellContributions::freeCudaMemory() {
@ -147,14 +144,13 @@ void WellContributions::freeCudaMemory() {
cudaFree(d_Bnnzs);
cudaFree(d_Ccols);
cudaFree(d_Bcols);
delete[] val_pointers;
cudaFree(d_val_pointers);
}
if (num_ms_wells > 0 && h_x) {
cudaFreeHost(h_x);
cudaFreeHost(h_y);
h_x = h_y = nullptr; // Mark as free for constructor
cudaFreeHost(h_y);
h_x = h_y = nullptr; // Mark as free for constructor
}
}
@ -200,10 +196,6 @@ void WellContributions::apply(double *d_x, double *d_y)
void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size)
{
if (!allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
switch (type) {
case MatrixType::C:
cudaMemcpy(d_Cnnzs + num_blocks_so_far * dim * dim_wells, values, sizeof(double) * val_size * dim * dim_wells, cudaMemcpyHostToDevice);
@ -218,17 +210,13 @@ void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *v
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
cudaMemcpy(d_val_pointers, val_pointers, sizeof(unsigned int) * (num_std_wells + 1), cudaMemcpyHostToDevice);
}
cudaMemcpy(d_val_pointers, val_pointers, sizeof(int) * (num_std_wells + 1), cudaMemcpyHostToDevice);
break;
default:
OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
}
cudaCheckLastError("WellContributions::addMatrix() failed");
if (MatrixType::B == type) {
num_blocks_so_far += val_size;
num_std_wells_so_far++;
}
}
void WellContributions::setCudaStream(cudaStream_t stream_)

View File

@ -71,30 +71,45 @@ public:
B
};
unsigned int dim; // number of columns in blocks in B and C, equal to StandardWell::numEq
unsigned int dim_wells; // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
std::vector<MultisegmentWellContribution*> multisegments;
#if HAVE_OPENCL
std::vector<double> h_Cnnzs_ocl, h_Dnnzs_ocl, h_Bnnzs_ocl;
std::vector<int> h_Ccols_ocl, h_Bcols_ocl;
std::vector<unsigned int> h_val_pointers_ocl;
#endif
private:
bool opencl_gpu = false;
bool cuda_gpu = false;
bool allocated = false;
unsigned int N; // number of rows (not blockrows) in vectors x and y
unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size()
#if HAVE_CUDA
bool allocated = false;
unsigned int Nb; // number of blockrows in vectors x and y
unsigned int dim; // number of columns in blocks in B and C, equal to StandardWell::numEq
unsigned int dim_wells; // number of rows in blocks in B and C, equal to StandardWell::numStaticWellEq
unsigned int num_blocks = 0; // total number of blocks in all wells
unsigned int num_std_wells = 0; // number of StandardWells in this object
unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size()
unsigned int num_blocks_so_far = 0; // keep track of where next data is written
unsigned int num_std_wells_so_far = 0; // keep track of where next data is written
unsigned int *val_pointers = nullptr; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
double *h_x = nullptr;
double *h_y = nullptr;
std::vector<MultisegmentWellContribution*> multisegments;
#if HAVE_OPENCL
typedef cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> kernel_type;
cl::Context *context;
cl::CommandQueue *queue;
kernel_type *kernel;
std::vector<cl::Event> events;
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
bool read_toOrder = false;
int *h_toOrder = nullptr;
#endif
#if HAVE_CUDA
cudaStream_t stream;
// data for StandardWells, could remain nullptrs if not used
@ -106,8 +121,12 @@ private:
double *d_z1 = nullptr;
double *d_z2 = nullptr;
unsigned int *d_val_pointers = nullptr;
double *h_x = nullptr;
double *h_y = nullptr;
/// Allocate GPU memory for StandardWells
void allocStandardWells();
/// Free GPU memory allocated with cuda.
void freeCudaMemory();
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent
@ -115,12 +134,6 @@ private:
/// \param[in] values array of nonzeroes
/// \param[in] val_size number of blocks in C or B, ignored for D
void addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size);
/// Allocate GPU memory for StandardWells
void allocStandardWells();
/// Free GPU memory allocated with cuda.
void freeCudaMemory();
#endif
public:
@ -134,6 +147,15 @@ public:
/// \param[in] d_x vector x, must be on GPU
/// \param[inout] d_y vector y, must be on GPU
void apply(double *d_x, double *d_y);
#endif
#if HAVE_OPENCL
void setKernel(kernel_type *kernel_);
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
void apply_mswells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
#endif
unsigned int getNumWells(){
return num_std_wells + num_ms_wells;
@ -145,7 +167,6 @@ public:
/// Allocate memory for the StandardWells
void alloc();
#endif
/// Create a new WellContributions
WellContributions(std::string gpu_mode);
@ -158,7 +179,6 @@ public:
/// \param[in] dim_wells number of rows
void setBlockSize(unsigned int dim, unsigned int dim_wells);
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
@ -188,7 +208,6 @@ public:
UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
std::vector<double> &Cvalues);
};
} //namespace Opm
#endif

View File

@ -1,176 +0,0 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp>
namespace bda
{
using Opm::OpmLog;
using Dune::Timer;
void WellContributionsOCLContainer::init(Opm::WellContributions &wellContribs, int N_, int Nb_){
N = N_;
Nb = Nb_;
dim = wellContribs.dim;
dim_wells = wellContribs.dim_wells;
if(!wellContribs.h_val_pointers_ocl.empty()){
num_blocks = wellContribs.h_Ccols_ocl.size();
num_std_wells = wellContribs.h_val_pointers_ocl.size() - 1;
s.Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Cnnzs_ocl.size());
s.Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Dnnzs_ocl.size());
s.Bnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Bnnzs_ocl.size());
s.Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Ccols_ocl.size());
s.Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Bcols_ocl.size());
s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size());
s.toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
}
}
void WellContributionsOCLContainer::reinit(Opm::WellContributions &wellContribs){
num_blocks = wellContribs.h_Ccols_ocl.size();
num_std_wells = wellContribs.h_val_pointers_ocl.size() - 1;
s.Cnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Cnnzs_ocl.size());
s.Dnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Dnnzs_ocl.size());
s.Bnnzs = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(double) * wellContribs.h_Bnnzs_ocl.size());
s.Ccols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Ccols_ocl.size());
s.Bcols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * wellContribs.h_Bcols_ocl.size());
s.val_pointers = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size());
}
void WellContributionsOCLContainer::copy_to_gpu(Opm::WellContributions &wellContribs, int *toOrder_){
toOrder.insert(toOrder.end(), toOrder_, toOrder_ + Nb);
if(num_std_wells > 0){
cl::Event event;
std::vector<cl::Event> events(7);
queue->enqueueWriteBuffer(s.Cnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data(), nullptr, &events[0]);
queue->enqueueWriteBuffer(s.Dnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data(), nullptr, &events[1]);
queue->enqueueWriteBuffer(s.Bnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Bnnzs_ocl.size(), wellContribs.h_Bnnzs_ocl.data(), nullptr, &events[2]);
queue->enqueueWriteBuffer(s.Ccols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Ccols_ocl.size(), wellContribs.h_Ccols_ocl.data(), nullptr, &events[3]);
queue->enqueueWriteBuffer(s.Bcols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Bcols_ocl.size(), wellContribs.h_Bcols_ocl.data(), nullptr, &events[4]);
queue->enqueueWriteBuffer(s.val_pointers, CL_FALSE, 0, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size(), wellContribs.h_val_pointers_ocl.data(), nullptr, &events[5]);
queue->enqueueWriteBuffer(s.toOrder, CL_FALSE, 0, sizeof(int) * toOrder.size(), toOrder.data(), nullptr, &events[6]);
event.waitForEvents(events);
}
if(!wellContribs.multisegments.empty()){
multisegments = std::move(wellContribs.multisegments);
num_ms_wells = multisegments.size();
x_msw.reserve(N);
y_msw.reserve(N);
}
}
void WellContributionsOCLContainer::update_on_gpu(Opm::WellContributions &wellContribs){
if(num_std_wells > 0){
if(num_std_wells != wellContribs.h_val_pointers_ocl.size() || num_blocks != wellContribs.h_Ccols_ocl.size()){
reinit(wellContribs);
}
cl::Event event;
std::vector<cl::Event> events(6);
queue->enqueueWriteBuffer(s.Cnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Cnnzs_ocl.size(), wellContribs.h_Cnnzs_ocl.data(), nullptr, &events[0]);
queue->enqueueWriteBuffer(s.Dnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Dnnzs_ocl.size(), wellContribs.h_Dnnzs_ocl.data(), nullptr, &events[1]);
queue->enqueueWriteBuffer(s.Bnnzs, CL_FALSE, 0, sizeof(double) * wellContribs.h_Bnnzs_ocl.size(), wellContribs.h_Bnnzs_ocl.data(), nullptr, &events[2]);
queue->enqueueWriteBuffer(s.Ccols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Ccols_ocl.size(), wellContribs.h_Ccols_ocl.data(), nullptr, &events[3]);
queue->enqueueWriteBuffer(s.Bcols, CL_FALSE, 0, sizeof(int) * wellContribs.h_Bcols_ocl.size(), wellContribs.h_Bcols_ocl.data(), nullptr, &events[4]);
queue->enqueueWriteBuffer(s.val_pointers, CL_FALSE, 0, sizeof(unsigned int) * wellContribs.h_val_pointers_ocl.size(), wellContribs.h_val_pointers_ocl.data(), nullptr, &events[5]);
event.waitForEvents(events);
}
if(!wellContribs.multisegments.empty()){
multisegments = std::move(wellContribs.multisegments);
}
}
void WellContributionsOCLContainer::setOpenCLContext(cl::Context *context_){
this->context = context_;
}
void WellContributionsOCLContainer::setOpenCLQueue(cl::CommandQueue *queue_){
this->queue = queue_;
}
void WellContributionsOCLContainer::setKernel(cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> *stdwell_apply_){
this->stdwell_apply = stdwell_apply_;
}
void WellContributionsOCLContainer::applyStdWells(cl::Buffer& x, cl::Buffer& y){
const unsigned int work_group_size = 32;
const unsigned int total_work_items = num_std_wells * work_group_size;
const unsigned int lmem1 = sizeof(double) * work_group_size;
const unsigned int lmem2 = sizeof(double) * dim_wells;
cl::Event event;
event = (*stdwell_apply)(cl::EnqueueArgs(*queue, cl::NDRange(total_work_items), cl::NDRange(work_group_size)),
s.Cnnzs, s.Dnnzs, s.Bnnzs, s.Ccols, s.Bcols, x, y, s.toOrder, dim, dim_wells, s.val_pointers,
cl::Local(lmem1), cl::Local(lmem2), cl::Local(lmem2));
}
void WellContributionsOCLContainer::applyMSWells(cl::Buffer& x, cl::Buffer& y) {
cl::Event event;
std::vector<cl::Event> events(2);
// copy vectors x and y from GPU to CPU
queue->enqueueReadBuffer(x, CL_FALSE, 0, sizeof(double) * N, x_msw.data(), nullptr, &events[0]);
queue->enqueueReadBuffer(y, CL_FALSE, 0, sizeof(double) * N, y_msw.data(), nullptr, &events[1]);
event.waitForEvents(events);
// actually apply MultisegmentWells
for(auto well: multisegments){
well->setReordering(toOrder.data(), true);
well->apply(x_msw.data(), y_msw.data());
}
// copy vector y from CPU to GPU
queue->enqueueWriteBuffer(y, CL_FALSE, 0, sizeof(double) * N, y_msw.data(), nullptr, &event);
event.wait();
}
void WellContributionsOCLContainer::apply(cl::Buffer& x, cl::Buffer& y){
if(num_std_wells > 0){
applyStdWells(x, y);
}
if(num_ms_wells > 0){
applyMSWells(x, y);
}
}
WellContributionsOCLContainer::~WellContributionsOCLContainer(){
if(num_ms_wells > 0){
for (auto ms : multisegments) {
delete ms;
}
}
}
} // end namespace bda

View File

@ -1,78 +0,0 @@
/*
Copyright 2019 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef WELLCONTRIBUTIONSOCLCONTAINER_HPP
#define WELLCONTRIBUTIONSOCLCONTAINER_HPP
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <opm/simulators/linalg/bda/MultisegmentWellContribution.hpp>
namespace bda
{
class WellContributionsOCLContainer
{
private:
int N, Nb;
unsigned int dim, dim_wells;
unsigned int num_blocks = 0;
unsigned int num_std_wells = 0;
unsigned int num_ms_wells = 0; // number of MultisegmentWells in this object, must equal multisegments.size()
std::vector<int> toOrder;
std::vector<double> x_msw, y_msw;
std::vector<Opm::MultisegmentWellContribution*> multisegments;
typedef struct {
cl::Buffer Cnnzs, Dnnzs, Bnnzs;
cl::Buffer Ccols, Bcols;
cl::Buffer val_pointers, toOrder;
} GPU_storage;
GPU_storage s;
cl::Context *context;
cl::CommandQueue *queue;
cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> *stdwell_apply;
void reinit(Opm::WellContributions &wellContribs);
void applyStdWells(cl::Buffer& x, cl::Buffer& y);
void applyMSWells(cl::Buffer& x, cl::Buffer& y);
public:
WellContributionsOCLContainer() {}
~WellContributionsOCLContainer();
WellContributionsOCLContainer(const WellContributionsOCLContainer&) = delete;
WellContributionsOCLContainer& operator=(const WellContributionsOCLContainer&) = delete;
void apply(cl::Buffer& x, cl::Buffer& y);
void init(Opm::WellContributions &wellContribs, int N, int Nb);
void copy_to_gpu(Opm::WellContributions &wellContribs, int *toOrder_);
void update_on_gpu(Opm::WellContributions &wellContribs);
void setOpenCLContext(cl::Context *context);
void setOpenCLQueue(cl::CommandQueue *queue);
void setKernel(cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
cl::Buffer&, cl::Buffer&, cl::Buffer&, cl::Buffer&,
const unsigned int, const unsigned int, cl::Buffer&,
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> *stdwell_apply_);
};
} // end namespace bda
#endif

View File

@ -46,7 +46,142 @@ using Dune::Timer;
template <unsigned int block_size>
openclSolverBackend<block_size>::openclSolverBackend(int verbosity_, int maxit_, double tolerance_, unsigned int platformID_, unsigned int deviceID_, ILUReorder opencl_ilu_reorder) : BdaSolver<block_size>(verbosity_, maxit_, tolerance_, platformID_, deviceID_) {
prec = new Preconditioner(opencl_ilu_reorder, verbosity_);
wcontainer = new WContainer();
cl_int err = CL_SUCCESS;
std::ostringstream out;
try {
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if (platforms.size() == 0) {
OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL platforms are found");
}
out << "Found " << platforms.size() << " OpenCL platforms" << "\n";
if (verbosity >= 1) {
std::string platform_info;
for (unsigned int i = 0; i < platforms.size(); ++i) {
platforms[i].getInfo(CL_PLATFORM_NAME, &platform_info);
out << "Platform name : " << platform_info << "\n";
platforms[i].getInfo(CL_PLATFORM_VENDOR, &platform_info);
out << "Platform vendor : " << platform_info << "\n";
platforms[i].getInfo(CL_PLATFORM_VERSION, &platform_info);
out << "Platform version : " << platform_info << "\n";
platforms[i].getInfo(CL_PLATFORM_PROFILE, &platform_info);
out << "Platform profile : " << platform_info << "\n";
platforms[i].getInfo(CL_PLATFORM_EXTENSIONS, &platform_info);
out << "Platform extensions: " << platform_info << "\n\n";
}
}
OpmLog::info(out.str());
out.str("");
out.clear();
if (platforms.size() <= platformID) {
OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID");
} else {
std::string platform_info;
out << "Chosen:\n";
platforms[platformID].getInfo(CL_PLATFORM_NAME, &platform_info);
out << "Platform name : " << platform_info << "\n";
platforms[platformID].getInfo(CL_PLATFORM_VERSION, &platform_info);
out << "Platform version : " << platform_info << "\n";
OpmLog::info(out.str());
out.str("");
out.clear();
}
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[platformID])(), 0};
context.reset(new cl::Context(CL_DEVICE_TYPE_GPU, properties));
devices = context->getInfo<CL_CONTEXT_DEVICES>();
if (devices.size() == 0){
OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found");
}
out << "Found " << devices.size() << " OpenCL devices" << "\n";
if (verbosity >= 1) {
for (unsigned int i = 0; i < devices.size(); ++i) {
std::string device_info;
std::vector<size_t> work_sizes;
std::vector<cl_device_partition_property> partitions;
devices[i].getInfo(CL_DEVICE_NAME, &device_info);
out << "CL_DEVICE_NAME : " << device_info << "\n";
devices[i].getInfo(CL_DEVICE_VENDOR, &device_info);
out << "CL_DEVICE_VENDOR : " << device_info << "\n";
devices[i].getInfo(CL_DRIVER_VERSION, &device_info);
out << "CL_DRIVER_VERSION : " << device_info << "\n";
devices[i].getInfo(CL_DEVICE_BUILT_IN_KERNELS, &device_info);
out << "CL_DEVICE_BUILT_IN_KERNELS: " << device_info << "\n";
devices[i].getInfo(CL_DEVICE_PROFILE, &device_info);
out << "CL_DEVICE_PROFILE : " << device_info << "\n";
devices[i].getInfo(CL_DEVICE_OPENCL_C_VERSION, &device_info);
out << "CL_DEVICE_OPENCL_C_VERSION: " << device_info << "\n";
devices[i].getInfo(CL_DEVICE_EXTENSIONS, &device_info);
out << "CL_DEVICE_EXTENSIONS : " << device_info << "\n";
devices[i].getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &work_sizes);
for (unsigned int j = 0; j < work_sizes.size(); ++j) {
out << "CL_DEVICE_MAX_WORK_ITEM_SIZES[" << j << "]: " << work_sizes[j] << "\n";
}
devices[i].getInfo(CL_DEVICE_PARTITION_PROPERTIES, &partitions);
for (unsigned int j = 0; j < partitions.size(); ++j) {
out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
}
partitions.clear();
devices[i].getInfo(CL_DEVICE_PARTITION_TYPE, &partitions);
for (unsigned int j = 0; j < partitions.size(); ++j) {
out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
}
// C-style properties
cl_device_id tmp_id = devices[i]();
cl_ulong size;
clGetDeviceInfo(tmp_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
out << "CL_DEVICE_LOCAL_MEM_SIZE : " << size / 1024 << " KB\n";
clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
out << "CL_DEVICE_GLOBAL_MEM_SIZE : " << size / 1024 / 1024 / 1024 << " GB\n";
clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_ulong), &size, 0);
out << "CL_DEVICE_MAX_COMPUTE_UNITS : " << size << "\n";
clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &size, 0);
out << "CL_DEVICE_MAX_MEM_ALLOC_SIZE : " << size / 1024 / 1024 << " MB\n";
clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(cl_ulong), &size, 0);
out << "CL_DEVICE_MAX_WORK_GROUP_SIZE : " << size << "\n";
clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
out << "CL_DEVICE_GLOBAL_MEM_SIZE : " << size / 1024 / 1024 / 1024 << " GB\n\n";
}
}
OpmLog::info(out.str());
out.str("");
out.clear();
if (devices.size() <= deviceID){
OPM_THROW(std::logic_error, "Error chosen too high OpenCL device ID");
} else {
std::string device_info;
out << "Chosen:\n";
devices[deviceID].getInfo(CL_DEVICE_NAME, &device_info);
out << "CL_DEVICE_NAME : " << device_info << "\n";
devices[deviceID].getInfo(CL_DEVICE_VERSION, &device_info);
out << "CL_DEVICE_VERSION : " << device_info << "\n";
OpmLog::info(out.str());
out.str("");
out.clear();
}
cl::Event event;
queue.reset(new cl::CommandQueue(*context, devices[deviceID], 0, &err));
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "OpenCL Error: " << error.what() << "(" << error.err() << ")\n";
oss << getErrorString(error.err());
// rethrow exception
OPM_THROW(std::logic_error, oss.str());
} catch (const std::logic_error& error) {
// rethrow exception by OPM_THROW in the try{}, without this, a segfault occurs
throw error;
}
}
@ -177,11 +312,15 @@ void openclSolverBackend<block_size>::spmv_blocked_w(cl::Buffer vals, cl::Buffer
}
template <unsigned int block_size>
void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res) {
float it;
double rho, rhop, beta, alpha, omega, tmp1, tmp2;
double norm, norm_0;
if(wellContribs.getNumWells() > 0){
wellContribs.setKernel(stdwell_apply_k.get());
}
Timer t_total, t_prec(false), t_spmv(false), t_well(false), t_rest(false);
// set r to the initial residual
@ -236,7 +375,9 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
// apply wellContributions
t_well.start();
wcontainer->apply(d_pw, d_v);
if(wellContribs.getNumWells() > 0){
wellContribs.apply(d_pw, d_v, d_toOrder);
}
t_well.stop();
t_rest.start();
@ -265,7 +406,9 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
// apply wellContributions
t_well.start();
wcontainer->apply(d_s, d_t);
if(wellContribs.getNumWells() > 0){
wellContribs.apply(d_s, d_t, d_toOrder);
}
t_well.stop();
t_rest.start();
@ -313,7 +456,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(BdaResult& res) {
template <unsigned int block_size>
void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, WellContributions &wellContribs) {
void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, double *vals, int *rows, int *cols) {
this->N = N_;
this->nnz = nnz_;
this->nnzb = nnz_ / block_size / block_size;
@ -327,127 +470,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
out.str("");
out.clear();
cl_int err = CL_SUCCESS;
try {
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
if (platforms.size() == 0) {
OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL platforms are found");
}
out << "Found " << platforms.size() << " OpenCL platforms" << "\n";
if (verbosity >= 1) {
std::string platform_info;
for (unsigned int i = 0; i < platforms.size(); ++i) {
platforms[i].getInfo(CL_PLATFORM_NAME, &platform_info);
out << "Platform name : " << platform_info << "\n";
platforms[i].getInfo(CL_PLATFORM_VENDOR, &platform_info);
out << "Platform vendor : " << platform_info << "\n";
platforms[i].getInfo(CL_PLATFORM_VERSION, &platform_info);
out << "Platform version : " << platform_info << "\n";
platforms[i].getInfo(CL_PLATFORM_PROFILE, &platform_info);
out << "Platform profile : " << platform_info << "\n";
platforms[i].getInfo(CL_PLATFORM_EXTENSIONS, &platform_info);
out << "Platform extensions: " << platform_info << "\n\n";
}
}
OpmLog::info(out.str());
out.str("");
out.clear();
if (platforms.size() <= platformID) {
OPM_THROW(std::logic_error, "Error chosen too high OpenCL platform ID");
} else {
std::string platform_info;
out << "Chosen:\n";
platforms[platformID].getInfo(CL_PLATFORM_NAME, &platform_info);
out << "Platform name : " << platform_info << "\n";
platforms[platformID].getInfo(CL_PLATFORM_VERSION, &platform_info);
out << "Platform version : " << platform_info << "\n";
OpmLog::info(out.str());
out.str("");
out.clear();
}
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[platformID])(), 0};
context.reset(new cl::Context(CL_DEVICE_TYPE_GPU, properties));
std::vector<cl::Device> devices = context->getInfo<CL_CONTEXT_DEVICES>();
if (devices.size() == 0){
OPM_THROW(std::logic_error, "Error openclSolver is selected but no OpenCL devices are found");
}
out << "Found " << devices.size() << " OpenCL devices" << "\n";
if (verbosity >= 1) {
for (unsigned int i = 0; i < devices.size(); ++i) {
std::string device_info;
std::vector<size_t> work_sizes;
std::vector<cl_device_partition_property> partitions;
devices[i].getInfo(CL_DEVICE_NAME, &device_info);
out << "CL_DEVICE_NAME : " << device_info << "\n";
devices[i].getInfo(CL_DEVICE_VENDOR, &device_info);
out << "CL_DEVICE_VENDOR : " << device_info << "\n";
devices[i].getInfo(CL_DRIVER_VERSION, &device_info);
out << "CL_DRIVER_VERSION : " << device_info << "\n";
devices[i].getInfo(CL_DEVICE_BUILT_IN_KERNELS, &device_info);
out << "CL_DEVICE_BUILT_IN_KERNELS: " << device_info << "\n";
devices[i].getInfo(CL_DEVICE_PROFILE, &device_info);
out << "CL_DEVICE_PROFILE : " << device_info << "\n";
devices[i].getInfo(CL_DEVICE_OPENCL_C_VERSION, &device_info);
out << "CL_DEVICE_OPENCL_C_VERSION: " << device_info << "\n";
devices[i].getInfo(CL_DEVICE_EXTENSIONS, &device_info);
out << "CL_DEVICE_EXTENSIONS : " << device_info << "\n";
devices[i].getInfo(CL_DEVICE_MAX_WORK_ITEM_SIZES, &work_sizes);
for (unsigned int j = 0; j < work_sizes.size(); ++j) {
out << "CL_DEVICE_MAX_WORK_ITEM_SIZES[" << j << "]: " << work_sizes[j] << "\n";
}
devices[i].getInfo(CL_DEVICE_PARTITION_PROPERTIES, &partitions);
for (unsigned int j = 0; j < partitions.size(); ++j) {
out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
}
partitions.clear();
devices[i].getInfo(CL_DEVICE_PARTITION_TYPE, &partitions);
for (unsigned int j = 0; j < partitions.size(); ++j) {
out << "CL_DEVICE_PARTITION_PROPERTIES[" << j << "]: " << partitions[j] << "\n";
}
// C-style properties
cl_device_id tmp_id = devices[i]();
cl_ulong size;
clGetDeviceInfo(tmp_id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
out << "CL_DEVICE_LOCAL_MEM_SIZE : " << size / 1024 << " KB\n";
clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
out << "CL_DEVICE_GLOBAL_MEM_SIZE : " << size / 1024 / 1024 / 1024 << " GB\n";
clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_ulong), &size, 0);
out << "CL_DEVICE_MAX_COMPUTE_UNITS : " << size << "\n";
clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &size, 0);
out << "CL_DEVICE_MAX_MEM_ALLOC_SIZE : " << size / 1024 / 1024 << " MB\n";
clGetDeviceInfo(tmp_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(cl_ulong), &size, 0);
out << "CL_DEVICE_MAX_WORK_GROUP_SIZE : " << size << "\n";
clGetDeviceInfo(tmp_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &size, 0);
out << "CL_DEVICE_GLOBAL_MEM_SIZE : " << size / 1024 / 1024 / 1024 << " GB\n\n";
}
}
OpmLog::info(out.str());
out.str("");
out.clear();
if (devices.size() <= deviceID){
OPM_THROW(std::logic_error, "Error chosen too high OpenCL device ID");
} else {
std::string device_info;
out << "Chosen:\n";
devices[deviceID].getInfo(CL_DEVICE_NAME, &device_info);
out << "CL_DEVICE_NAME : " << device_info << "\n";
devices[deviceID].getInfo(CL_DEVICE_VERSION, &device_info);
out << "CL_DEVICE_VERSION : " << device_info << "\n";
OpmLog::info(out.str());
out.str("");
out.clear();
}
cl::Program::Sources source(1, std::make_pair(axpy_s, strlen(axpy_s))); // what does this '1' mean? cl::Program::Sources is of type 'std::vector<std::pair<const char*, long unsigned int> >'
source.emplace_back(std::make_pair(dot_1_s, strlen(dot_1_s)));
source.emplace_back(std::make_pair(norm_s, strlen(norm_s)));
@ -457,16 +480,10 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
source.emplace_back(std::make_pair(ILU_apply2_s, strlen(ILU_apply2_s)));
source.emplace_back(std::make_pair(stdwell_apply_s, strlen(stdwell_apply_s)));
program = cl::Program(*context, source);
program.build(devices);
cl::Event event;
queue.reset(new cl::CommandQueue(*context, devices[deviceID], 0, &err));
prec->setOpenCLContext(context.get());
prec->setOpenCLQueue(queue.get());
wcontainer->setOpenCLContext(context.get());
wcontainer->setOpenCLQueue(queue.get());
rb = new double[N];
tmp = new double[N];
@ -491,7 +508,7 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
d_Acols = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * nnzb);
d_Arows = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * (Nb + 1));
wcontainer->init(wellContribs, N, Nb);
d_toOrder = cl::Buffer(*context, CL_MEM_READ_WRITE, sizeof(int) * Nb);
// queue.enqueueNDRangeKernel() is a blocking/synchronous call, at least for NVIDIA
// cl::make_kernel<> myKernel(); myKernel(args, arg1, arg2); is also blocking
@ -510,7 +527,6 @@ void openclSolverBackend<block_size>::initialize(int N_, int nnz_, int dim, doub
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg>(cl::Kernel(program, "stdwell_apply")));
prec->setKernels(ILU_apply1_k.get(), ILU_apply2_k.get());
wcontainer->setKernel(stdwell_apply_k.get());
} catch (const cl::Error& error) {
std::ostringstream oss;
@ -534,12 +550,10 @@ void openclSolverBackend<block_size>::finalize() {
delete[] vals_contiguous;
#endif
delete prec;
delete wcontainer;
} // end finalize()
template <unsigned int block_size>
void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions &wellContribs) {
void openclSolverBackend<block_size>::copy_system_to_gpu() {
Timer t;
cl::Event event;
@ -558,11 +572,10 @@ void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions &well
queue->enqueueWriteBuffer(d_Acols, CL_TRUE, 0, sizeof(int) * nnzb, rmat->colIndices);
queue->enqueueWriteBuffer(d_Arows, CL_TRUE, 0, sizeof(int) * (Nb + 1), rmat->rowPointers);
queue->enqueueWriteBuffer(d_b, CL_TRUE, 0, sizeof(double) * N, rb);
queue->enqueueWriteBuffer(d_toOrder, CL_TRUE, 0, sizeof(int) * Nb, toOrder);
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
wcontainer->copy_to_gpu(wellContribs, toOrder);
if (verbosity > 2) {
std::ostringstream out;
out << "openclSolver::copy_system_to_gpu(): " << t.stop() << " s";
@ -572,7 +585,7 @@ void openclSolverBackend<block_size>::copy_system_to_gpu(WellContributions &well
// don't copy rowpointers and colindices, they stay the same
template <unsigned int block_size>
void openclSolverBackend<block_size>::update_system_on_gpu(WellContributions &wellContribs) {
void openclSolverBackend<block_size>::update_system_on_gpu() {
Timer t;
cl::Event event;
@ -592,8 +605,6 @@ void openclSolverBackend<block_size>::update_system_on_gpu(WellContributions &we
queue->enqueueFillBuffer(d_x, 0, 0, sizeof(double) * N, nullptr, &event);
event.wait();
wcontainer->update_on_gpu(wellContribs);
if (verbosity > 2) {
std::ostringstream out;
out << "openclSolver::update_system_on_gpu(): " << t.stop() << " s";
@ -660,12 +671,12 @@ bool openclSolverBackend<block_size>::create_preconditioner() {
template <unsigned int block_size>
void openclSolverBackend<block_size>::solve_system(BdaResult &res) {
void openclSolverBackend<block_size>::solve_system(WellContributions &wellContribs, BdaResult &res) {
Timer t;
// actually solve
try {
gpu_pbicgstab(res);
gpu_pbicgstab(wellContribs, res);
} catch (const cl::Error& error) {
std::ostringstream oss;
oss << "openclSolverBackend::solve_system error: " << error.what() << "(" << error.err() << ")\n";
@ -706,7 +717,7 @@ void openclSolverBackend<block_size>::get_result(double *x) {
template <unsigned int block_size>
SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int dim, double *vals, int *rows, int *cols, double *b, WellContributions& wellContribs, BdaResult &res) {
if (initialized == false) {
initialize(N_, nnz_, dim, vals, rows, cols, wellContribs);
initialize(N_, nnz_, dim, vals, rows, cols);
if (analysis_done == false) {
if (!analyse_matrix()) {
return SolverStatus::BDA_SOLVER_ANALYSIS_FAILED;
@ -716,15 +727,15 @@ SolverStatus openclSolverBackend<block_size>::solve_system(int N_, int nnz_, int
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
copy_system_to_gpu(wellContribs);
copy_system_to_gpu();
} else {
update_system(vals, b);
if (!create_preconditioner()) {
return SolverStatus::BDA_SOLVER_CREATE_PRECONDITIONER_FAILED;
}
update_system_on_gpu(wellContribs);
update_system_on_gpu();
}
solve_system(res);
solve_system(wellContribs, res);
return SolverStatus::BDA_SOLVER_SUCCESS;
}

View File

@ -25,9 +25,10 @@
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <opm/simulators/linalg/bda/WellContributionsOCLContainer.hpp>
#include <opm/simulators/linalg/bda/BILU0.hpp>
#include <tuple>
namespace bda
{
@ -37,7 +38,6 @@ class openclSolverBackend : public BdaSolver<block_size>
{
typedef BdaSolver<block_size> Base;
typedef BILU0<block_size> Preconditioner;
typedef WellContributionsOCLContainer WContainer;
using Base::N;
using Base::Nb;
@ -54,19 +54,17 @@ private:
double *rb = nullptr; // reordered b vector, the matrix is reordered, so b must also be
double *vals_contiguous = nullptr; // only used if COPY_ROW_BY_ROW is true in openclSolverBackend.cpp
bool analysis_done = false;
// OpenCL variables must be reusable, they are initialized in initialize()
cl::Buffer d_Avals, d_Acols, d_Arows; // (reordered) matrix in BSR format on GPU
cl::Buffer d_x, d_b, d_rb, d_r, d_rw, d_p; // vectors, used during linear solve
cl::Buffer d_pw, d_s, d_t, d_v; // vectors, used during linear solve
cl::Buffer d_tmp; // used as tmp GPU buffer for dot() and norm()
cl::Buffer d_toOrder;
double *tmp = nullptr; // used as tmp CPU buffer for dot() and norm()
// shared pointers are also passed to other objects
std::vector<cl::Device> devices;
cl::Program program;
std::shared_ptr<cl::Context> context;
std::shared_ptr<cl::CommandQueue> queue;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > dot_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, cl::Buffer&, const unsigned int, cl::LocalSpaceArg> > norm_k;
std::unique_ptr<cl::make_kernel<cl::Buffer&, const double, cl::Buffer&, const unsigned int> > axpy_k;
@ -80,8 +78,8 @@ private:
cl::LocalSpaceArg, cl::LocalSpaceArg, cl::LocalSpaceArg> > stdwell_apply_k;
Preconditioner *prec = nullptr; // only supported preconditioner is BILU0
WContainer *wcontainer = nullptr;
int *toOrder = nullptr, *fromOrder = nullptr; // BILU0 reorders rows of the matrix via these mappings
bool analysis_done = false;
std::unique_ptr<BlockedMatrix<block_size> > mat = nullptr; // original matrix
BlockedMatrix<block_size> *rmat = nullptr; // reordered matrix, used for spmv
@ -133,7 +131,7 @@ private:
/// Solve linear system using ilu0-bicgstab
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void gpu_pbicgstab(BdaResult& res);
void gpu_pbicgstab(WellContributions& wellContribs, BdaResult& res);
/// Initialize GPU and allocate memory
/// \param[in] N number of nonzeroes, divide by dim*dim to get number of blocks
@ -142,13 +140,13 @@ private:
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
/// \param[in] rows array of rowPointers, contains N/dim+1 values
/// \param[in] cols array of columnIndices, contains nnz values
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols, WellContributions &wellContribs);
void initialize(int N, int nnz, int dim, double *vals, int *rows, int *cols);
/// Clean memory
void finalize();
/// Copy linear system to GPU
void copy_system_to_gpu(WellContributions &wellContribs);
void copy_system_to_gpu();
/// Reorder the linear system so it corresponds with the coloring
/// \param[in] vals array of nonzeroes, each block is stored row-wise and contiguous, contains nnz values
@ -156,7 +154,7 @@ private:
void update_system(double *vals, double *b);
/// Update linear system on GPU, don't copy rowpointers and colindices, they stay the same
void update_system_on_gpu(WellContributions &wellContribs);
void update_system_on_gpu();
/// Analyse sparsity pattern to extract parallelism
/// \return true iff analysis was successful
@ -169,9 +167,11 @@ private:
/// Solve linear system
/// \param[in] wellContribs WellContributions, to apply them separately, instead of adding them to matrix A
/// \param[inout] res summary of solver result
void solve_system(BdaResult &res);
void solve_system(WellContributions &wellContribs, BdaResult &res);
public:
std::shared_ptr<cl::Context> context;
std::shared_ptr<cl::CommandQueue> queue;
/// Construct a openclSolver
/// \param[in] linear_solver_verbosity verbosity of openclSolver

View File

@ -951,7 +951,6 @@ namespace Opm {
// prepare for StandardWells
wellContribs.setBlockSize(StandardWell<TypeTag>::numEq, StandardWell<TypeTag>::numStaticWellEq);
#if HAVE_CUDA
for(unsigned int i = 0; i < well_container_.size(); i++){
auto& well = well_container_[i];
std::shared_ptr<StandardWell<TypeTag> > derived = std::dynamic_pointer_cast<StandardWell<TypeTag> >(well);
@ -964,7 +963,6 @@ namespace Opm {
// allocate memory for data from StandardWells
wellContribs.alloc();
#endif
for(unsigned int i = 0; i < well_container_.size(); i++){
auto& well = well_container_[i];