changed: refactor BDA well contributions

split in API specific classes for Cuda/OpenCL

this to
1) it's cleaner
2) it avoids pulling in openCL code in cuda classes which leads
to clashes between nvidia headers and opencl.hpp

there is still too much API specific things in interface between the
bda components to work through a virtual interface so we still have to cast
to the relevant implementation in various places.
This commit is contained in:
Arne Morten Kvarving 2021-11-11 14:45:12 +01:00
parent 352d31a1e9
commit e25caba8ed
13 changed files with 388 additions and 279 deletions

View File

@ -49,6 +49,7 @@ list (APPEND MAIN_SOURCE_FILES
opm/simulators/linalg/FlexibleSolver6.cpp
opm/simulators/linalg/PropertyTree.cpp
opm/simulators/linalg/setupPropertyTree.cpp
opm/simulators/linalg/bda/WellContributions.cpp
opm/simulators/utils/PartiallySupportedFlowKeywords.cpp
opm/simulators/utils/readDeck.cpp
opm/simulators/utils/UnsupportedFlowKeywords.cpp
@ -93,8 +94,7 @@ list (APPEND MAIN_SOURCE_FILES
if(CUDA_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuWellContributions.cu)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
endif()
@ -106,6 +106,7 @@ if(OPENCL_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclKernels.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclWellContributions.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
@ -265,6 +266,7 @@ list (APPEND PUBLIC_HEADER_FILES
opm/simulators/linalg/bda/opencl.hpp
opm/simulators/linalg/bda/openclKernels.hpp
opm/simulators/linalg/bda/openclSolverBackend.hpp
opm/simulators/linalg/bda/openclWellContributions.hpp
opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
opm/simulators/linalg/bda/WellContributions.hpp
opm/simulators/linalg/amgcpr.hh

View File

@ -262,18 +262,18 @@ namespace Opm
bool use_fpga = bdaBridge->getUseFpga();
if (use_gpu || use_fpga) {
const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
WellContributions wellContribs(accelerator_mode, useWellConn_);
bdaBridge->initWellContributions(wellContribs);
auto wellContribs = WellContributions::create(accelerator_mode, useWellConn_);
bdaBridge->initWellContributions(*wellContribs);
// the WellContributions can only be applied separately with CUDA or OpenCL, not with an FPGA or amgcl
#if HAVE_CUDA || HAVE_OPENCL
if (!useWellConn_) {
simulator_.problem().wellModel().getWellContributions(wellContribs);
simulator_.problem().wellModel().getWellContributions(*wellContribs);
}
#endif
// Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, wellContribs, result);
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, *wellContribs, result);
if (result.converged) {
// get result vector x from non-Dune backend, iff solve was successful
bdaBridge->get_result(x);

View File

@ -35,6 +35,7 @@
#if HAVE_OPENCL
#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
#endif
#if HAVE_FPGA
@ -273,7 +274,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[
if(accelerator_mode.compare("opencl") == 0){
#if HAVE_OPENCL
const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
static_cast<WellContributionsOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif

View File

@ -25,178 +25,68 @@
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#ifdef HAVE_OPENCL
#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
#endif
#ifdef HAVE_CUDA
#include <opm/simulators/linalg/bda/cuWellContributions.hpp>
#endif
namespace Opm
{
#if HAVE_OPENCL
using Opm::Accelerator::OpenclKernels;
#endif
WellContributions::WellContributions(std::string accelerator_mode, bool useWellConn){
std::unique_ptr<WellContributions>
WellContributions::create(const std::string& accelerator_mode, bool useWellConn)
{
if(accelerator_mode.compare("cusparse") == 0){
cuda_gpu = true;
#if HAVE_CUDA
return std::make_unique<WellContributionsCuda>();
#else
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: CUDA is not enabled");
#endif
}
else if(accelerator_mode.compare("opencl") == 0){
opencl_gpu = true;
#if HAVE_OPENCL
return std::make_unique<WellContributionsOCL>();
#else
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: OpenCL is not enabled");
#endif
}
else if(accelerator_mode.compare("fpga") == 0){
// unused for FPGA, but must be defined to avoid error
return std::make_unique<WellContributions>();
}
else if(accelerator_mode.compare("amgcl") == 0){
if (!useWellConn) {
OPM_THROW(std::logic_error, "Error amgcl requires --matrix-add-well-contributions=true");
}
return std::make_unique<WellContributions>();
}
else{
OPM_THROW(std::logic_error, "Invalid accelerator mode");
}
}
WellContributions::~WellContributions()
void WellContributions::addMatrix([[maybe_unused]] MatrixType type,
[[maybe_unused]] int* colIndices,
[[maybe_unused]] double* values,
[[maybe_unused]] unsigned int val_size)
{
multisegments.clear();
#if HAVE_CUDA
if(cuda_gpu){
freeCudaMemory(); // should come before 'delete[] h_x'
}
#if !HAVE_CUDA && !HAVE_OPENCL
OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
#endif
#if HAVE_OPENCL
if(opencl_gpu){
if(num_ms_wells > 0){
delete[] h_x;
delete[] h_y;
}
}
#endif
}
#if HAVE_OPENCL
void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_){
this->context = context_;
this->queue = queue_;
}
void WellContributions::setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_){
this->kernel = kernel_;
this->kernel_no_reorder = kernel_no_reorder_;
}
void WellContributions::setReordering(int *h_toOrder_, bool reorder_)
{
this->h_toOrder = h_toOrder_;
this->reorder = reorder_;
}
void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
if (reorder) {
OpenclKernels::apply_stdwells_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
} else {
OpenclKernels::apply_stdwells_no_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
}
}
void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
if(h_x == nullptr){
h_x = new double[N];
h_y = new double[N];
}
events.resize(2);
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]);
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
// actually apply MultisegmentWells
for (auto& well : multisegments) {
well->setReordering(h_toOrder, reorder);
well->apply(h_x, h_y);
}
// copy vector y from CPU to GPU
events.resize(1);
queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[0]);
events[0].wait();
events.clear();
}
void WellContributions::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
if(num_std_wells > 0){
apply_stdwells(d_x, d_y, d_toOrder);
}
if(num_ms_wells > 0){
apply_mswells(d_x, d_y);
}
}
#endif
void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
{
if (!allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
#if HAVE_CUDA
if(cuda_gpu){
addMatrixGpu(type, colIndices, values, val_size);
}
#endif
#if HAVE_OPENCL
if(opencl_gpu){
switch (type) {
case MatrixType::C:
events.resize(2);
queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
break;
case MatrixType::D:
events.resize(1);
queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]);
events[0].wait();
events.clear();
break;
case MatrixType::B:
events.resize(2);
queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
events.resize(1);
queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers.data(), nullptr, &events[0]);
events[0].wait();
events.clear();
}
break;
default:
OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
}
}
#endif
this->APIaddMatrix(type, colIndices, values, val_size);
if(MatrixType::B == type) {
num_blocks_so_far += val_size;
num_std_wells_so_far++;
}
#if !HAVE_CUDA && !HAVE_OPENCL
OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
#endif
}
void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
@ -225,22 +115,7 @@ void WellContributions::alloc()
if (num_std_wells > 0) {
val_pointers.resize(num_std_wells+1);
#if HAVE_CUDA
if(cuda_gpu){
allocStandardWells();
}
#endif
#if HAVE_OPENCL
if(opencl_gpu){
d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
}
#endif
this->APIalloc();
allocated = true;
}
}

View File

@ -20,15 +20,6 @@
#ifndef WELLCONTRIBUTIONS_HEADER_INCLUDED
#define WELLCONTRIBUTIONS_HEADER_INCLUDED
#if HAVE_CUDA
#include <cuda_runtime.h>
#endif
#if HAVE_OPENCL
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#endif
#include <memory>
#include <vector>
@ -61,6 +52,8 @@ namespace Opm
class WellContributions
{
public:
static std::unique_ptr<WellContributions> create(const std::string& accelerator_mode, bool useWellConn);
#if DUNE_VERSION_NEWER(DUNE_ISTL, 2, 7)
using UMFPackIndex = SuiteSparse_long;
#else
@ -73,9 +66,7 @@ public:
B
};
private:
bool opencl_gpu = false;
bool cuda_gpu = false;
protected:
bool allocated = false;
unsigned int N; // number of rows (not blockrows) in vectors x and y
@ -88,80 +79,9 @@ private:
unsigned int num_std_wells_so_far = 0; // keep track of where next data is written
std::vector<unsigned int> val_pointers; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
double *h_x = nullptr;
double *h_y = nullptr;
std::vector<std::unique_ptr<MultisegmentWellContribution>> multisegments;
#if HAVE_OPENCL
cl::Context *context;
cl::CommandQueue *queue;
Opm::Accelerator::stdwell_apply_kernel_type *kernel;
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder;
std::vector<cl::Event> events;
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
bool reorder = false;
int *h_toOrder = nullptr;
#endif
#if HAVE_CUDA
cudaStream_t stream;
// data for StandardWells, could remain nullptrs if not used
double *d_Cnnzs = nullptr;
double *d_Dnnzs = nullptr;
double *d_Bnnzs = nullptr;
int *d_Ccols = nullptr;
int *d_Bcols = nullptr;
double *d_z1 = nullptr;
double *d_z2 = nullptr;
unsigned int *d_val_pointers = nullptr;
/// Allocate GPU memory for StandardWells
void allocStandardWells();
/// Free GPU memory allocated with cuda.
void freeCudaMemory();
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
/// \param[in] values array of nonzeroes
/// \param[in] val_size number of blocks in C or B, ignored for D
void addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size);
#endif
public:
#if HAVE_CUDA
/// Set a cudaStream to be used
/// \param[in] stream the cudaStream that is used to launch the kernel in
void setCudaStream(cudaStream_t stream);
/// Apply all Wells in this object
/// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
/// \param[in] d_x vector x, must be on GPU
/// \param[inout] d_y vector y, must be on GPU
void apply(double *d_x, double *d_y);
#endif
#if HAVE_OPENCL
void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
/// Those indices need to be mapped via toOrder
/// \param[in] toOrder array with mappings
/// \param[in] reorder whether reordering is actually used or not
void setReordering(int *toOrder, bool reorder);
void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
#endif
unsigned int getNumWells(){
return num_std_wells + num_ms_wells;
}
@ -173,13 +93,8 @@ public:
/// Allocate memory for the StandardWells
void alloc();
/// Create a new WellContributions
/// \param[in] accelerator_mode string indicating which solverBackend is used
/// \param[in] useWellConn true iff wellcontributions are added to the matrix
WellContributions(std::string accelerator_mode, bool useWellConn);
/// Destroy a WellContributions, and free memory
~WellContributions();
/// Empty destructor.
virtual ~WellContributions() = default;
/// Indicate how large the blocks of the StandardWell (C and B) are
/// \param[in] dim number of columns
@ -212,6 +127,12 @@ public:
unsigned int DnumBlocks, double *Dvalues,
UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
std::vector<double> &Cvalues);
protected:
//! \brief API specific allocation.
virtual void APIalloc() {}
/// Api specific upload of matrix.
virtual void APIaddMatrix(MatrixType, int*, double*, unsigned int) {}
};
} //namespace Opm

View File

@ -22,12 +22,13 @@
#include <cstdlib>
#include <cstring>
#include "opm/simulators/linalg/bda/cuWellContributions.hpp"
#include "opm/simulators/linalg/bda/cuda_header.hpp"
#include <cuda_runtime.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include "opm/simulators/linalg/bda/WellContributions.hpp"
namespace Opm
{
@ -125,18 +126,8 @@ __global__ void apply_well_contributions(
}
void WellContributions::allocStandardWells()
WellContributionsCuda::~WellContributionsCuda()
{
cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
cudaCheckLastError("apply_gpu malloc failed");
}
void WellContributions::freeCudaMemory() {
// delete data for StandardWell
if (num_std_wells > 0) {
cudaFree(d_Cnnzs);
@ -154,10 +145,20 @@ void WellContributions::freeCudaMemory() {
}
}
void WellContributionsCuda::APIalloc()
{
cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
cudaCheckLastError("apply_gpu malloc failed");
}
// Apply the WellContributions, similar to StandardWell::apply()
// y -= (C^T *(D^-1*( B*x)))
void WellContributions::apply(double *d_x, double *d_y)
void WellContributionsCuda::apply(double *d_x, double *d_y)
{
// apply MultisegmentWells
@ -194,7 +195,7 @@ void WellContributions::apply(double *d_x, double *d_y)
}
void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size)
void WellContributionsCuda::APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
{
switch (type) {
case MatrixType::C:
@ -219,7 +220,7 @@ void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *v
cudaCheckLastError("WellContributions::addMatrix() failed");
}
void WellContributions::setCudaStream(cudaStream_t stream_)
void WellContributionsCuda::setCudaStream(cudaStream_t stream_)
{
this->stream = stream_;
for (auto& well : multisegments) {

View File

@ -0,0 +1,78 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef WELLCONTRIBUTIONS_CUDA_HEADER_INCLUDED
#define WELLCONTRIBUTIONS_CUDA_HEADER_INCLUDED
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <cuda_runtime.h>
#include <memory>
#include <vector>
namespace Opm
{
class WellContributionsCuda : public WellContributions
{
public:
~WellContributionsCuda() override;
/// Set a cudaStream to be used
/// \param[in] stream the cudaStream that is used to launch the kernel in
void setCudaStream(cudaStream_t stream);
/// Apply all Wells in this object
/// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
/// \param[in] d_x vector x, must be on GPU
/// \param[inout] d_y vector y, must be on GPU
void apply(double *d_x, double *d_y);
protected:
/// Allocate memory for the StandardWells
void APIalloc() override;
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
/// \param[in] values array of nonzeroes
/// \param[in] val_size number of blocks in C or B, ignored for D
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
cudaStream_t stream;
// data for StandardWells, could remain nullptrs if not used
double *d_Cnnzs = nullptr;
double *d_Dnnzs = nullptr;
double *d_Bnnzs = nullptr;
int *d_Ccols = nullptr;
int *d_Bcols = nullptr;
double *d_z1 = nullptr;
double *d_z2 = nullptr;
unsigned int *d_val_pointers = nullptr;
double* h_x = nullptr;
double* h_y = nullptr;
};
} //namespace Opm
#endif

View File

@ -26,6 +26,7 @@
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
#include <opm/simulators/linalg/bda/cuWellContributions.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/cuda_header.hpp>
@ -72,7 +73,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
float it;
if (wellContribs.getNumWells() > 0) {
wellContribs.setCudaStream(stream);
static_cast<WellContributionsCuda&>(wellContribs).setCudaStream(stream);
}
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@ -116,7 +117,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
// apply wellContributions
if (wellContribs.getNumWells() > 0) {
wellContribs.apply(d_pw, d_v);
static_cast<WellContributionsCuda&>(wellContribs).apply(d_pw, d_v);
}
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@ -147,7 +148,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
// apply wellContributions
if (wellContribs.getNumWells() > 0) {
wellContribs.apply(d_s, d_t);
static_cast<WellContributionsCuda&>(wellContribs).apply(d_s, d_t);
}
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);

View File

@ -26,6 +26,7 @@
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/Reorder.hpp>
@ -262,7 +263,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions
t_well.start();
if(wellContribs.getNumWells() > 0){
wellContribs.apply(d_pw, d_v, d_toOrder);
static_cast<WellContributionsOCL&>(wellContribs).apply(d_pw, d_v, d_toOrder);
}
t_well.stop();
@ -293,7 +294,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions
t_well.start();
if(wellContribs.getNumWells() > 0){
wellContribs.apply(d_s, d_t, d_toOrder);
static_cast<WellContributionsOCL&>(wellContribs).apply(d_s, d_t, d_toOrder);
}
t_well.stop();
@ -520,10 +521,10 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b, Wel
mat->nnzValues = vals;
if (opencl_ilu_reorder != ILUReorder::NONE) {
reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
wellContribs.setReordering(toOrder, true);
static_cast<WellContributionsOCL&>(wellContribs).setReordering(toOrder, true);
} else {
rb = b;
wellContribs.setReordering(nullptr, false);
static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false);
}
if (verbosity > 2) {

View File

@ -0,0 +1,154 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h> // CMake
#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
#include <cstdlib>
#include <cstring>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
namespace Opm
{
using Accelerator::OpenclKernels;
void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_) {
this->context = context_;
this->queue = queue_;
}
void WellContributionsOCL::setKernel(Accelerator::stdwell_apply_kernel_type* kernel_,
Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder_) {
this->kernel = kernel_;
this->kernel_no_reorder = kernel_no_reorder_;
}
void WellContributionsOCL::setReordering(int* h_toOrder_, bool reorder_)
{
this->h_toOrder = h_toOrder_;
this->reorder = reorder_;
}
void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
if (reorder) {
OpenclKernels::apply_stdwells_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
} else {
OpenclKernels::apply_stdwells_no_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
}
}
void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
if (h_x.empty()) {
h_x.resize(N);
h_y.resize(N);
}
events.resize(2);
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x.data(), nullptr, &events[0]);
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
// actually apply MultisegmentWells
for (auto& well : multisegments) {
well->setReordering(h_toOrder, reorder);
well->apply(h_x.data(), h_y.data());
}
// copy vector y from CPU to GPU
events.resize(1);
queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[0]);
events[0].wait();
events.clear();
}
void WellContributionsOCL::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
if(num_std_wells > 0){
apply_stdwells(d_x, d_y, d_toOrder);
}
if(num_ms_wells > 0){
apply_mswells(d_x, d_y);
}
}
void WellContributionsOCL::APIaddMatrix(MatrixType type,
int* colIndices,
double* values,
unsigned int val_size)
{
if (!allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
switch (type) {
case MatrixType::C:
events.resize(2);
queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
break;
case MatrixType::D:
events.resize(1);
queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]);
events[0].wait();
events.clear();
break;
case MatrixType::B:
events.resize(2);
queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
events.resize(1);
queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers.data(), nullptr, &events[0]);
events[0].wait();
events.clear();
}
break;
default:
OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributionsOCL::addMatrix()");
}
}
void WellContributionsOCL::APIalloc()
{
d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
}
} //namespace Opm

View File

@ -0,0 +1,75 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef WELLCONTRIBUTIONS_OPENCL_HEADER_INCLUDED
#define WELLCONTRIBUTIONS_OPENCL_HEADER_INCLUDED
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <memory>
#include <vector>
namespace Opm
{
class WellContributionsOCL : public WellContributions
{
public:
void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
/// Those indices need to be mapped via toOrder
/// \param[in] toOrder array with mappings
/// \param[in] reorder whether reordering is actually used or not
void setReordering(int* toOrder, bool reorder);
void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
protected:
/// Allocate memory for the StandardWells
void APIalloc() override;
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
cl::Context* context;
cl::CommandQueue* queue;
Opm::Accelerator::stdwell_apply_kernel_type* kernel;
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder;
std::vector<cl::Event> events;
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
bool reorder = false;
int *h_toOrder = nullptr;
std::vector<double> h_x;
std::vector<double> h_y;
};
} //namespace Opm
#endif

View File

@ -81,7 +81,7 @@ testCusparseSolver(const boost::property_tree::ptree& prm, const std::string& ma
Dune::InverseOperatorResult result;
Vector x(rhs.size());
Opm::WellContributions wellContribs("cusparse", false);
auto wellContribs = Opm::WellContributions::create("cusparse", false);
std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
try {
bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
@ -89,7 +89,7 @@ testCusparseSolver(const boost::property_tree::ptree& prm, const std::string& ma
BOOST_WARN_MESSAGE(true, error.what());
throw DeviceInitException(error.what());
}
bridge->solve_system(&matrix, rhs, wellContribs, result);
bridge->solve_system(&matrix, rhs, *wellContribs, result);
bridge->get_result(x);
return x;

View File

@ -80,7 +80,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, const std::string& matr
Dune::InverseOperatorResult result;
Vector x(rhs.size());
Opm::WellContributions wellContribs("opencl", false);
auto wellContribs = Opm::WellContributions::create("opencl", false);
std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
try {
bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
@ -88,7 +88,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, const std::string& matr
BOOST_WARN_MESSAGE(true, error.what());
throw PlatformInitException(error.what());
}
bridge->solve_system(&matrix, rhs, wellContribs, result);
bridge->solve_system(&matrix, rhs, *wellContribs, result);
bridge->get_result(x);
return x;