Merge pull request #3691 from akva2/bda_wellcontrib_refactor

Refactor BDA well contributions
This commit is contained in:
Markus Blatt 2021-11-15 10:26:44 +01:00 committed by GitHub
commit afc5ed751f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 399 additions and 291 deletions

View File

@ -49,6 +49,7 @@ list (APPEND MAIN_SOURCE_FILES
opm/simulators/linalg/FlexibleSolver6.cpp
opm/simulators/linalg/PropertyTree.cpp
opm/simulators/linalg/setupPropertyTree.cpp
opm/simulators/linalg/bda/WellContributions.cpp
opm/simulators/utils/PartiallySupportedFlowKeywords.cpp
opm/simulators/utils/readDeck.cpp
opm/simulators/utils/UnsupportedFlowKeywords.cpp
@ -93,8 +94,7 @@ list (APPEND MAIN_SOURCE_FILES
if(CUDA_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cusparseSolverBackend.cu)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cu)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/cuWellContributions.cu)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
endif()
@ -106,6 +106,7 @@ if(OPENCL_FOUND)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/opencl.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclKernels.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclSolverBackend.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/openclWellContributions.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/BdaBridge.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/WellContributions.cpp)
list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/bda/MultisegmentWellContribution.cpp)
@ -265,6 +266,7 @@ list (APPEND PUBLIC_HEADER_FILES
opm/simulators/linalg/bda/opencl.hpp
opm/simulators/linalg/bda/openclKernels.hpp
opm/simulators/linalg/bda/openclSolverBackend.hpp
opm/simulators/linalg/bda/openclWellContributions.hpp
opm/simulators/linalg/bda/MultisegmentWellContribution.hpp
opm/simulators/linalg/bda/WellContributions.hpp
opm/simulators/linalg/amgcpr.hh

View File

@ -262,18 +262,18 @@ namespace Opm
bool use_fpga = bdaBridge->getUseFpga();
if (use_gpu || use_fpga) {
const std::string accelerator_mode = EWOMS_GET_PARAM(TypeTag, std::string, AcceleratorMode);
WellContributions wellContribs(accelerator_mode, useWellConn_);
bdaBridge->initWellContributions(wellContribs);
auto wellContribs = WellContributions::create(accelerator_mode, useWellConn_);
bdaBridge->initWellContributions(*wellContribs);
// the WellContributions can only be applied separately with CUDA or OpenCL, not with an FPGA or amgcl
#if HAVE_CUDA || HAVE_OPENCL
if (!useWellConn_) {
simulator_.problem().wellModel().getWellContributions(wellContribs);
simulator_.problem().wellModel().getWellContributions(*wellContribs);
}
#endif
// Const_cast needed since the CUDA stuff overwrites values for better matrix condition..
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, wellContribs, result);
bdaBridge->solve_system(const_cast<Matrix*>(&getMatrix()), *rhs_, *wellContribs, result);
if (result.converged) {
// get result vector x from non-Dune backend, iff solve was successful
bdaBridge->get_result(x);

View File

@ -19,6 +19,9 @@
#include <config.h>
#include "dune/istl/bcrsmatrix.hh"
#include <opm/simulators/linalg/matrixblock.hh>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include <opm/material/common/Unused.hpp>
@ -32,6 +35,7 @@
#if HAVE_OPENCL
#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
#endif
#if HAVE_FPGA
@ -270,7 +274,7 @@ void BdaBridge<BridgeMatrix, BridgeVector, block_size>::initWellContributions([[
if(accelerator_mode.compare("opencl") == 0){
#if HAVE_OPENCL
const auto openclBackend = static_cast<const Opm::Accelerator::openclSolverBackend<block_size>*>(backend.get());
wellContribs.setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
static_cast<WellContributionsOCL&>(wellContribs).setOpenCLEnv(openclBackend->context.get(), openclBackend->queue.get());
#else
OPM_THROW(std::logic_error, "Error openclSolver was chosen, but OpenCL was not found by CMake");
#endif

View File

@ -22,17 +22,9 @@
#include "dune/istl/solver.hh" // for struct InverseOperatorResult
#include "dune/istl/bcrsmatrix.hh"
#include <opm/simulators/linalg/matrixblock.hh>
#include <opm/simulators/linalg/bda/BdaSolver.hpp>
#include <opm/simulators/linalg/bda/ILUReorder.hpp>
#if HAVE_FPGA
#include <opm/simulators/linalg/bda/FPGASolverBackend.hpp>
#endif
namespace Opm
{

View File

@ -25,178 +25,68 @@
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#ifdef HAVE_OPENCL
#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
#endif
#ifdef HAVE_CUDA
#include <opm/simulators/linalg/bda/cuWellContributions.hpp>
#endif
namespace Opm
{
#if HAVE_OPENCL
using Opm::Accelerator::OpenclKernels;
#endif
WellContributions::WellContributions(std::string accelerator_mode, bool useWellConn){
std::unique_ptr<WellContributions>
WellContributions::create(const std::string& accelerator_mode, bool useWellConn)
{
if(accelerator_mode.compare("cusparse") == 0){
cuda_gpu = true;
#if HAVE_CUDA
return std::make_unique<WellContributionsCuda>();
#else
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: CUDA is not enabled");
#endif
}
else if(accelerator_mode.compare("opencl") == 0){
opencl_gpu = true;
#if HAVE_OPENCL
return std::make_unique<WellContributionsOCL>();
#else
OPM_THROW(std::runtime_error, "Cannot initialize well contributions: OpenCL is not enabled");
#endif
}
else if(accelerator_mode.compare("fpga") == 0){
// unused for FPGA, but must be defined to avoid error
return std::make_unique<WellContributions>();
}
else if(accelerator_mode.compare("amgcl") == 0){
if (!useWellConn) {
OPM_THROW(std::logic_error, "Error amgcl requires --matrix-add-well-contributions=true");
}
return std::make_unique<WellContributions>();
}
else{
OPM_THROW(std::logic_error, "Invalid accelerator mode");
}
}
WellContributions::~WellContributions()
void WellContributions::addMatrix([[maybe_unused]] MatrixType type,
[[maybe_unused]] int* colIndices,
[[maybe_unused]] double* values,
[[maybe_unused]] unsigned int val_size)
{
multisegments.clear();
#if HAVE_CUDA
if(cuda_gpu){
freeCudaMemory(); // should come before 'delete[] h_x'
}
#if !HAVE_CUDA && !HAVE_OPENCL
OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
#endif
#if HAVE_OPENCL
if(opencl_gpu){
if(num_ms_wells > 0){
delete[] h_x;
delete[] h_y;
}
}
#endif
}
#if HAVE_OPENCL
void WellContributions::setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_){
this->context = context_;
this->queue = queue_;
}
void WellContributions::setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_){
this->kernel = kernel_;
this->kernel_no_reorder = kernel_no_reorder_;
}
void WellContributions::setReordering(int *h_toOrder_, bool reorder_)
{
this->h_toOrder = h_toOrder_;
this->reorder = reorder_;
}
void WellContributions::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
if (reorder) {
OpenclKernels::apply_stdwells_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
} else {
OpenclKernels::apply_stdwells_no_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
}
}
void WellContributions::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
if(h_x == nullptr){
h_x = new double[N];
h_y = new double[N];
}
events.resize(2);
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x, nullptr, &events[0]);
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
// actually apply MultisegmentWells
for (auto& well : multisegments) {
well->setReordering(h_toOrder, reorder);
well->apply(h_x, h_y);
}
// copy vector y from CPU to GPU
events.resize(1);
queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y, nullptr, &events[0]);
events[0].wait();
events.clear();
}
void WellContributions::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
if(num_std_wells > 0){
apply_stdwells(d_x, d_y, d_toOrder);
}
if(num_ms_wells > 0){
apply_mswells(d_x, d_y);
}
}
#endif
void WellContributions::addMatrix([[maybe_unused]] MatrixType type, [[maybe_unused]] int *colIndices, [[maybe_unused]] double *values, [[maybe_unused]] unsigned int val_size)
{
if (!allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
#if HAVE_CUDA
if(cuda_gpu){
addMatrixGpu(type, colIndices, values, val_size);
}
#endif
#if HAVE_OPENCL
if(opencl_gpu){
switch (type) {
case MatrixType::C:
events.resize(2);
queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
break;
case MatrixType::D:
events.resize(1);
queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]);
events[0].wait();
events.clear();
break;
case MatrixType::B:
events.resize(2);
queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
events.resize(1);
queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers.data(), nullptr, &events[0]);
events[0].wait();
events.clear();
}
break;
default:
OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributions::addMatrix()");
}
}
#endif
this->APIaddMatrix(type, colIndices, values, val_size);
if(MatrixType::B == type) {
num_blocks_so_far += val_size;
num_std_wells_so_far++;
}
#if !HAVE_CUDA && !HAVE_OPENCL
OPM_THROW(std::logic_error, "Error cannot add StandardWell matrix on GPU because neither CUDA nor OpenCL were found by cmake");
#endif
}
void WellContributions::setBlockSize(unsigned int dim_, unsigned int dim_wells_)
@ -225,22 +115,7 @@ void WellContributions::alloc()
if (num_std_wells > 0) {
val_pointers.resize(num_std_wells+1);
#if HAVE_CUDA
if(cuda_gpu){
allocStandardWells();
}
#endif
#if HAVE_OPENCL
if(opencl_gpu){
d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
}
#endif
this->APIalloc();
allocated = true;
}
}

View File

@ -20,15 +20,6 @@
#ifndef WELLCONTRIBUTIONS_HEADER_INCLUDED
#define WELLCONTRIBUTIONS_HEADER_INCLUDED
#if HAVE_CUDA
#include <cuda_runtime.h>
#endif
#if HAVE_OPENCL
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#endif
#include <memory>
#include <vector>
@ -61,6 +52,8 @@ namespace Opm
class WellContributions
{
public:
static std::unique_ptr<WellContributions> create(const std::string& accelerator_mode, bool useWellConn);
#if DUNE_VERSION_NEWER(DUNE_ISTL, 2, 7)
using UMFPackIndex = SuiteSparse_long;
#else
@ -73,9 +66,7 @@ public:
B
};
private:
bool opencl_gpu = false;
bool cuda_gpu = false;
protected:
bool allocated = false;
unsigned int N; // number of rows (not blockrows) in vectors x and y
@ -88,80 +79,9 @@ private:
unsigned int num_std_wells_so_far = 0; // keep track of where next data is written
std::vector<unsigned int> val_pointers; // val_pointers[wellID] == index of first block for this well in Ccols and Bcols
double *h_x = nullptr;
double *h_y = nullptr;
std::vector<std::unique_ptr<MultisegmentWellContribution>> multisegments;
#if HAVE_OPENCL
cl::Context *context;
cl::CommandQueue *queue;
Opm::Accelerator::stdwell_apply_kernel_type *kernel;
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder;
std::vector<cl::Event> events;
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
bool reorder = false;
int *h_toOrder = nullptr;
#endif
#if HAVE_CUDA
cudaStream_t stream;
// data for StandardWells, could remain nullptrs if not used
double *d_Cnnzs = nullptr;
double *d_Dnnzs = nullptr;
double *d_Bnnzs = nullptr;
int *d_Ccols = nullptr;
int *d_Bcols = nullptr;
double *d_z1 = nullptr;
double *d_z2 = nullptr;
unsigned int *d_val_pointers = nullptr;
/// Allocate GPU memory for StandardWells
void allocStandardWells();
/// Free GPU memory allocated with cuda.
void freeCudaMemory();
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
/// \param[in] values array of nonzeroes
/// \param[in] val_size number of blocks in C or B, ignored for D
void addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size);
#endif
public:
#if HAVE_CUDA
/// Set a cudaStream to be used
/// \param[in] stream the cudaStream that is used to launch the kernel in
void setCudaStream(cudaStream_t stream);
/// Apply all Wells in this object
/// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
/// \param[in] d_x vector x, must be on GPU
/// \param[inout] d_y vector y, must be on GPU
void apply(double *d_x, double *d_y);
#endif
#if HAVE_OPENCL
void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
/// Those indices need to be mapped via toOrder
/// \param[in] toOrder array with mappings
/// \param[in] reorder whether reordering is actually used or not
void setReordering(int *toOrder, bool reorder);
void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
#endif
unsigned int getNumWells(){
return num_std_wells + num_ms_wells;
}
@ -173,13 +93,8 @@ public:
/// Allocate memory for the StandardWells
void alloc();
/// Create a new WellContributions
/// \param[in] accelerator_mode string indicating which solverBackend is used
/// \param[in] useWellConn true iff wellcontributions are added to the matrix
WellContributions(std::string accelerator_mode, bool useWellConn);
/// Destroy a WellContributions, and free memory
~WellContributions();
/// Empty destructor.
virtual ~WellContributions() = default;
/// Indicate how large the blocks of the StandardWell (C and B) are
/// \param[in] dim number of columns
@ -212,6 +127,12 @@ public:
unsigned int DnumBlocks, double *Dvalues,
UMFPackIndex *DcolPointers, UMFPackIndex *DrowIndices,
std::vector<double> &Cvalues);
protected:
//! \brief API specific allocation.
virtual void APIalloc() {}
/// Api specific upload of matrix.
virtual void APIaddMatrix(MatrixType, int*, double*, unsigned int) {}
};
} //namespace Opm

View File

@ -22,12 +22,13 @@
#include <cstdlib>
#include <cstring>
#include "opm/simulators/linalg/bda/cuWellContributions.hpp"
#include "opm/simulators/linalg/bda/cuda_header.hpp"
#include <cuda_runtime.h>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
#include "opm/simulators/linalg/bda/WellContributions.hpp"
namespace Opm
{
@ -125,18 +126,8 @@ __global__ void apply_well_contributions(
}
void WellContributions::allocStandardWells()
WellContributionsCuda::~WellContributionsCuda()
{
cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
cudaCheckLastError("apply_gpu malloc failed");
}
void WellContributions::freeCudaMemory() {
// delete data for StandardWell
if (num_std_wells > 0) {
cudaFree(d_Cnnzs);
@ -154,10 +145,20 @@ void WellContributions::freeCudaMemory() {
}
}
void WellContributionsCuda::APIalloc()
{
cudaMalloc((void**)&d_Cnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Dnnzs, sizeof(double) * num_std_wells * dim_wells * dim_wells);
cudaMalloc((void**)&d_Bnnzs, sizeof(double) * num_blocks * dim * dim_wells);
cudaMalloc((void**)&d_Ccols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_Bcols, sizeof(int) * num_blocks);
cudaMalloc((void**)&d_val_pointers, sizeof(unsigned int) * (num_std_wells + 1));
cudaCheckLastError("apply_gpu malloc failed");
}
// Apply the WellContributions, similar to StandardWell::apply()
// y -= (C^T *(D^-1*( B*x)))
void WellContributions::apply(double *d_x, double *d_y)
void WellContributionsCuda::apply(double *d_x, double *d_y)
{
// apply MultisegmentWells
@ -194,7 +195,7 @@ void WellContributions::apply(double *d_x, double *d_y)
}
void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *values, unsigned int val_size)
void WellContributionsCuda::APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size)
{
switch (type) {
case MatrixType::C:
@ -219,7 +220,7 @@ void WellContributions::addMatrixGpu(MatrixType type, int *colIndices, double *v
cudaCheckLastError("WellContributions::addMatrix() failed");
}
void WellContributions::setCudaStream(cudaStream_t stream_)
void WellContributionsCuda::setCudaStream(cudaStream_t stream_)
{
this->stream = stream_;
for (auto& well : multisegments) {

View File

@ -0,0 +1,78 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef WELLCONTRIBUTIONS_CUDA_HEADER_INCLUDED
#define WELLCONTRIBUTIONS_CUDA_HEADER_INCLUDED
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <cuda_runtime.h>
#include <memory>
#include <vector>
namespace Opm
{
class WellContributionsCuda : public WellContributions
{
public:
~WellContributionsCuda() override;
/// Set a cudaStream to be used
/// \param[in] stream the cudaStream that is used to launch the kernel in
void setCudaStream(cudaStream_t stream);
/// Apply all Wells in this object
/// performs y -= (C^T * (D^-1 * (B*x))) for all Wells
/// \param[in] d_x vector x, must be on GPU
/// \param[inout] d_y vector y, must be on GPU
void apply(double *d_x, double *d_y);
protected:
/// Allocate memory for the StandardWells
void APIalloc() override;
/// Store a matrix in this object, in blocked csr format, can only be called after alloc() is called
/// \param[in] type indicate if C, D or B is sent
/// \param[in] colIndices columnindices of blocks in C or B, ignored for D
/// \param[in] values array of nonzeroes
/// \param[in] val_size number of blocks in C or B, ignored for D
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
cudaStream_t stream;
// data for StandardWells, could remain nullptrs if not used
double *d_Cnnzs = nullptr;
double *d_Dnnzs = nullptr;
double *d_Bnnzs = nullptr;
int *d_Ccols = nullptr;
int *d_Bcols = nullptr;
double *d_z1 = nullptr;
double *d_z2 = nullptr;
unsigned int *d_val_pointers = nullptr;
double* h_x = nullptr;
double* h_y = nullptr;
};
} //namespace Opm
#endif

View File

@ -26,6 +26,7 @@
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/cusparseSolverBackend.hpp>
#include <opm/simulators/linalg/bda/cuWellContributions.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/cuda_header.hpp>
@ -72,7 +73,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
float it;
if (wellContribs.getNumWells() > 0) {
wellContribs.setCudaStream(stream);
static_cast<WellContributionsCuda&>(wellContribs).setCudaStream(stream);
}
cusparseDbsrmv(cusparseHandle, order, operation, Nb, Nb, nnzb, &one, descr_M, d_bVals, d_bRows, d_bCols, block_size, d_x, &zero, d_r);
@ -116,7 +117,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
// apply wellContributions
if (wellContribs.getNumWells() > 0) {
wellContribs.apply(d_pw, d_v);
static_cast<WellContributionsCuda&>(wellContribs).apply(d_pw, d_v);
}
cublasDdot(cublasHandle, n, d_rw, 1, d_v, 1, &tmp1);
@ -147,7 +148,7 @@ void cusparseSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellCon
// apply wellContributions
if (wellContribs.getNumWells() > 0) {
wellContribs.apply(d_s, d_t);
static_cast<WellContributionsCuda&>(wellContribs).apply(d_s, d_t);
}
cublasDdot(cublasHandle, n, d_t, 1, d_r, 1, &tmp1);

View File

@ -26,6 +26,7 @@
#include <dune/common/timer.hh>
#include <opm/simulators/linalg/bda/openclSolverBackend.hpp>
#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
#include <opm/simulators/linalg/bda/BdaResult.hpp>
#include <opm/simulators/linalg/bda/Reorder.hpp>
@ -262,7 +263,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions
t_well.start();
if(wellContribs.getNumWells() > 0){
wellContribs.apply(d_pw, d_v, d_toOrder);
static_cast<WellContributionsOCL&>(wellContribs).apply(d_pw, d_v, d_toOrder);
}
t_well.stop();
@ -293,7 +294,7 @@ void openclSolverBackend<block_size>::gpu_pbicgstab(WellContributions& wellContr
// apply wellContributions
t_well.start();
if(wellContribs.getNumWells() > 0){
wellContribs.apply(d_s, d_t, d_toOrder);
static_cast<WellContributionsOCL&>(wellContribs).apply(d_s, d_t, d_toOrder);
}
t_well.stop();
@ -520,10 +521,10 @@ void openclSolverBackend<block_size>::update_system(double *vals, double *b, Wel
mat->nnzValues = vals;
if (opencl_ilu_reorder != ILUReorder::NONE) {
reorderBlockedVectorByPattern<block_size>(mat->Nb, b, fromOrder, rb);
wellContribs.setReordering(toOrder, true);
static_cast<WellContributionsOCL&>(wellContribs).setReordering(toOrder, true);
} else {
rb = b;
wellContribs.setReordering(nullptr, false);
static_cast<WellContributionsOCL&>(wellContribs).setReordering(nullptr, false);
}
if (verbosity > 2) {

View File

@ -0,0 +1,154 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h> // CMake
#include <opm/simulators/linalg/bda/openclWellContributions.hpp>
#include <cstdlib>
#include <cstring>
#include <opm/common/OpmLog/OpmLog.hpp>
#include <opm/common/ErrorMacros.hpp>
namespace Opm
{
using Accelerator::OpenclKernels;
void WellContributionsOCL::setOpenCLEnv(cl::Context* context_, cl::CommandQueue* queue_) {
this->context = context_;
this->queue = queue_;
}
void WellContributionsOCL::setKernel(Accelerator::stdwell_apply_kernel_type* kernel_,
Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder_) {
this->kernel = kernel_;
this->kernel_no_reorder = kernel_no_reorder_;
}
void WellContributionsOCL::setReordering(int* h_toOrder_, bool reorder_)
{
this->h_toOrder = h_toOrder_;
this->reorder = reorder_;
}
void WellContributionsOCL::apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
if (reorder) {
OpenclKernels::apply_stdwells_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, d_toOrder, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
} else {
OpenclKernels::apply_stdwells_no_reorder(*d_Cnnzs_ocl, *d_Dnnzs_ocl, *d_Bnnzs_ocl, *d_Ccols_ocl, *d_Bcols_ocl,
d_x, d_y, dim, dim_wells, *d_val_pointers_ocl, num_std_wells);
}
}
void WellContributionsOCL::apply_mswells(cl::Buffer d_x, cl::Buffer d_y){
if (h_x.empty()) {
h_x.resize(N);
h_y.resize(N);
}
events.resize(2);
queue->enqueueReadBuffer(d_x, CL_FALSE, 0, sizeof(double) * N, h_x.data(), nullptr, &events[0]);
queue->enqueueReadBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
// actually apply MultisegmentWells
for (auto& well : multisegments) {
well->setReordering(h_toOrder, reorder);
well->apply(h_x.data(), h_y.data());
}
// copy vector y from CPU to GPU
events.resize(1);
queue->enqueueWriteBuffer(d_y, CL_FALSE, 0, sizeof(double) * N, h_y.data(), nullptr, &events[0]);
events[0].wait();
events.clear();
}
void WellContributionsOCL::apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder){
if(num_std_wells > 0){
apply_stdwells(d_x, d_y, d_toOrder);
}
if(num_ms_wells > 0){
apply_mswells(d_x, d_y);
}
}
void WellContributionsOCL::APIaddMatrix(MatrixType type,
int* colIndices,
double* values,
unsigned int val_size)
{
if (!allocated) {
OPM_THROW(std::logic_error, "Error cannot add wellcontribution before allocating memory in WellContributions");
}
switch (type) {
case MatrixType::C:
events.resize(2);
queue->enqueueWriteBuffer(*d_Cnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Ccols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
break;
case MatrixType::D:
events.resize(1);
queue->enqueueWriteBuffer(*d_Dnnzs_ocl, CL_FALSE, sizeof(double) * num_std_wells_so_far * dim_wells * dim_wells, sizeof(double) * dim_wells * dim_wells, values, nullptr, &events[0]);
events[0].wait();
events.clear();
break;
case MatrixType::B:
events.resize(2);
queue->enqueueWriteBuffer(*d_Bnnzs_ocl, CL_FALSE, sizeof(double) * num_blocks_so_far * dim * dim_wells, sizeof(double) * val_size * dim * dim_wells, values, nullptr, &events[0]);
queue->enqueueWriteBuffer(*d_Bcols_ocl, CL_FALSE, sizeof(int) * num_blocks_so_far, sizeof(int) * val_size, colIndices, nullptr, &events[1]);
cl::WaitForEvents(events);
events.clear();
val_pointers[num_std_wells_so_far] = num_blocks_so_far;
if (num_std_wells_so_far == num_std_wells - 1) {
val_pointers[num_std_wells] = num_blocks;
events.resize(1);
queue->enqueueWriteBuffer(*d_val_pointers_ocl, CL_FALSE, 0, sizeof(unsigned int) * (num_std_wells + 1), val_pointers.data(), nullptr, &events[0]);
events[0].wait();
events.clear();
}
break;
default:
OPM_THROW(std::logic_error, "Error unsupported matrix ID for WellContributionsOCL::addMatrix()");
}
}
void WellContributionsOCL::APIalloc()
{
d_Cnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Dnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_std_wells * dim_wells * dim_wells);
d_Bnnzs_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(double) * num_blocks * dim * dim_wells);
d_Ccols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_Bcols_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(int) * num_blocks);
d_val_pointers_ocl = std::make_unique<cl::Buffer>(*context, CL_MEM_READ_WRITE, sizeof(unsigned int) * (num_std_wells + 1));
}
} //namespace Opm

View File

@ -0,0 +1,75 @@
/*
Copyright 2020 Equinor ASA
This file is part of the Open Porous Media project (OPM).
OPM is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
OPM is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef WELLCONTRIBUTIONS_OPENCL_HEADER_INCLUDED
#define WELLCONTRIBUTIONS_OPENCL_HEADER_INCLUDED
#include <opm/simulators/linalg/bda/WellContributions.hpp>
#include <opm/simulators/linalg/bda/opencl.hpp>
#include <opm/simulators/linalg/bda/openclKernels.hpp>
#include <memory>
#include <vector>
namespace Opm
{
class WellContributionsOCL : public WellContributions
{
public:
void setKernel(Opm::Accelerator::stdwell_apply_kernel_type *kernel_,
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type *kernel_no_reorder_);
void setOpenCLEnv(cl::Context *context_, cl::CommandQueue *queue_);
/// Since the rows of the matrix are reordered, the columnindices of the matrixdata is incorrect
/// Those indices need to be mapped via toOrder
/// \param[in] toOrder array with mappings
/// \param[in] reorder whether reordering is actually used or not
void setReordering(int* toOrder, bool reorder);
void apply_stdwells(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
void apply_mswells(cl::Buffer d_x, cl::Buffer d_y);
void apply(cl::Buffer d_x, cl::Buffer d_y, cl::Buffer d_toOrder);
protected:
/// Allocate memory for the StandardWells
void APIalloc() override;
void APIaddMatrix(MatrixType type, int *colIndices, double *values, unsigned int val_size) override;
cl::Context* context;
cl::CommandQueue* queue;
Opm::Accelerator::stdwell_apply_kernel_type* kernel;
Opm::Accelerator::stdwell_apply_no_reorder_kernel_type* kernel_no_reorder;
std::vector<cl::Event> events;
std::unique_ptr<cl::Buffer> d_Cnnzs_ocl, d_Dnnzs_ocl, d_Bnnzs_ocl;
std::unique_ptr<cl::Buffer> d_Ccols_ocl, d_Bcols_ocl;
std::unique_ptr<cl::Buffer> d_val_pointers_ocl;
bool reorder = false;
int *h_toOrder = nullptr;
std::vector<double> h_x;
std::vector<double> h_y;
};
} //namespace Opm
#endif

View File

@ -81,18 +81,22 @@ testCusparseSolver(const boost::property_tree::ptree& prm, const std::string& ma
Dune::InverseOperatorResult result;
Vector x(rhs.size());
Opm::WellContributions wellContribs("cusparse", false);
auto wellContribs = Opm::WellContributions::create("cusparse", false);
std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
try {
bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
bridge->solve_system(&matrix, rhs, *wellContribs, result);
bridge->get_result(x);
return x;
} catch (const std::logic_error& error) {
BOOST_WARN_MESSAGE(true, error.what());
throw DeviceInitException(error.what());
if (strstr(error.what(), "Could not get device") != nullptr)
throw DeviceInitException(error.what());
else
throw error;
}
bridge->solve_system(&matrix, rhs, wellContribs, result);
bridge->get_result(x);
return x;
}
namespace pt = boost::property_tree;

View File

@ -80,7 +80,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, const std::string& matr
Dune::InverseOperatorResult result;
Vector x(rhs.size());
Opm::WellContributions wellContribs("opencl", false);
auto wellContribs = Opm::WellContributions::create("opencl", false);
std::unique_ptr<Opm::BdaBridge<Matrix, Vector, bz> > bridge;
try {
bridge = std::make_unique<Opm::BdaBridge<Matrix, Vector, bz> >(gpu_mode, fpga_bitstream, linear_solver_verbosity, maxit, tolerance, platformID, deviceID, opencl_ilu_reorder);
@ -88,7 +88,7 @@ testOpenclSolver(const boost::property_tree::ptree& prm, const std::string& matr
BOOST_WARN_MESSAGE(true, error.what());
throw PlatformInitException(error.what());
}
bridge->solve_system(&matrix, rhs, wellContribs, result);
bridge->solve_system(&matrix, rhs, *wellContribs, result);
bridge->get_result(x);
return x;